package edu.stanford.nlp.patterns.surface;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObjectBuilder;
import javax.json.JsonReader;
import javax.json.JsonValue;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.GoldAnswerAnnotation;
import edu.stanford.nlp.patterns.surface.ConstantsAndVariables.ScorePhraseMeasures;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.EditDistance;
import edu.stanford.nlp.util.Execution;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.PriorityQueue;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Triple;
import edu.stanford.nlp.util.TypesafeMap;
import edu.stanford.nlp.util.TypesafeMap.Key;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Given text and a seed list, this class gives more words like the seed words
* by learning surface word patterns.
* <p>
*
* The class is multi-threaded (the <code>nthread</code> parameter sets the
* number of threads) and takes the inputs described below.
*
* To use the default options, run
* <p>
* <code>java -mx1000m edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass -file text_file -seedWordsFiles label1,seedwordlist1;label2,seedwordlist2;... -outDir output_directory (optional)</code>
* <p>
*
* <code>fileFormat</code>: (Optional) Default is text. Valid values are text
* (or txt) and ser, where the serialized file is of the type
* {@code Map<String, List<CoreLabel>>}.
* <p>
* <code>file</code>: (Required) Input file(s) (default assumed text). Can be
* one or more of (concatenated by comma or semi-colon): file, directory, files
* with regex in the filename (for example: "mydir/health-.*-processed.txt")
* <p>
* <code>seedWordsFiles</code>: (Required)
* label1,file_seed_words1;label2,file_seed_words2;... where file_seed_words are
* files with list of seed words, one in each line
* <p>
* <code>outDir</code>: (Optional) output directory where visualization/output
* files are stored
* <p>
* For other flags, see individual comments for each flag.
*
* <p>
* To use a properties file, see
* projects/core/data/edu/stanford/nlp/patterns/surface/example.properties
* as an example for the flags and their brief descriptions. Run the code as:
* <code>java -mx1000m edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass -props projects/core/data/edu/stanford/nlp/patterns/surface/example.properties</code>
*
* <p>
* IMPORTANT: Many flags are described in the classes
* {@link ConstantsAndVariables}, {@link CreatePatterns}, and
* {@link PhraseScorer}.
*
*
*
* @author Sonal Gupta (sonal@cs.stanford.edu)
*/
public class GetPatternsFromDataMultiClass implements Serializable {
// Version id used by Java serialization for this class.
private static final long serialVersionUID = 1L;
// Pattern sets per token; filled lazily by getPatterns, either computed through CreatePatterns
// or read back from a previously saved file.
// NOTE(review): presumably keyed by sentence id, then token index -- confirm against CreatePatterns.getAllPatterns.
public Map<String, Map<Integer, Triple<Set<SurfacePattern>, Set<SurfacePattern>, Set<SurfacePattern>>>> patternsForEachToken = null;
// NOTE(review): not assigned in the visible part of the file; presumably, per label, the words known to belong to other classes.
public Map<String, Set<String>> wordsForOtherClass = null;
// NOTE(review): not assigned in the visible part of the file; presumably counts of patterns seen for the other class.
Counter<String> patternsOtherClass = null;
// String channelNameLogger = "patterns";
/**
 * Measures available for scoring (ranking) patterns.
 * <p>
 * RlogF is from Riloff 1996, where R's denominator is (pos+neg+unlabeled)
 * <p>
 * RlogFPosNeg is RlogF where R's denominator is just the (pos+neg) examples
 * <p>
 * PosNegOdds is the ratio of the number of positive words to the number of
 * negative words
 * <p>
 * PosNegUnlabOdds is the ratio of the number of positive words to the number
 * of (unlabeled + negative) words
 * <p>
 * RatioAll is pos/(neg+pos+unlabeled)
 * <p>
 * YanGarber02 is the modified version presented in
 * "Unsupervised Learning of Generalized Names"
 * <p>
 * LOGREG learns a logistic regression classifier to combine weights to score
 * a phrase (same as PhEvalInPat, except that the score of an unlabeled phrase
 * is computed using a logistic regression classifier)
 * <p>
 * LOGREGlogP learns a logistic regression classifier to combine weights to
 * score a phrase (same as PhEvalInPatLogP, except that the score of an
 * unlabeled phrase is computed using a logistic regression classifier)
 * <p>
 * SqrtAllRatio is the pattern scoring used in the Gupta et al. JAMIA 2014 paper
 * <p>
 * F1SeedPattern (and the word-scoring measure BPB) are based on the paper
 * "Unsupervised Method for Automatic Construction of a disease dictionary..."
 * <p>
 * Precision, Recall, and FMeasure (controlled by the fbeta flag) rank the
 * patterns by their precision, recall and F_beta measure, respectively
 */
public enum PatternScoring {
F1SeedPattern, RlogF, RlogFPosNeg, RlogFUnlabNeg, RlogFNeg, PhEvalInPat, PhEvalInPatLogP, PosNegOdds,
YanGarber02, PosNegUnlabOdds, RatioAll, LOGREG, LOGREGlogP, SqrtAllRatio, LinICML03, kNN, Precision, Recall, FMeasure
}
/**
 * Measures available for scoring words. According to the note on {@link PatternScoring}, BPB
 * is based on the same paper as F1SeedPattern.
 * NOTE(review): WEIGHTEDNORM is not described anywhere in the visible part of the file.
 */
enum WordScoring {
BPB, WEIGHTEDNORM
}
// NOTE(review): presumably records which patterns were already written to the justification output -- not used in the visible part of the file.
Map<String, Boolean> writtenPatInJustification = new HashMap<String, Boolean>();
// NOTE(review): presumably the patterns learned so far, per label.
Map<String, Counter<SurfacePattern>> learnedPatterns = new HashMap<String, Counter<SurfacePattern>>();
// NOTE(review): presumably the words learned so far, per label.
Map<String, Counter<String>> learnedWords = new HashMap<String, Counter<String>>();
// One (word, pattern) counter per label; re-created with an empty counter per answer label in setUpConstructor.
public Map<String, TwoDimensionalCounter<String, SurfacePattern>> wordsPatExtracted = new HashMap<String, TwoDimensionalCounter<String, SurfacePattern>>();
// Properties handed to the constructor; used to fill the option fields of the helper objects.
Properties props;
// Created in setUpConstructor from props and constVars.
public ScorePhrases scorePhrases;
// Shared configuration and state; its options are filled from props in setUpConstructor.
public ConstantsAndVariables constVars = new ConstantsAndVariables();
// Created in setUpConstructor from props and constVars.
public CreatePatterns createPats;
// Number format with at most two fraction digits.
DecimalFormat df = new DecimalFormat("#.##");
/**
 * Convenience constructor for the case when there is only one label. Delegates to the
 * constructor that takes an explicit answer class, passing
 * {@code PatternsAnnotations.PatternLabel1} as that class.
 *
 * @param props properties used to configure the learner
 * @param sents the sentences, as a map from sentence id to its tokens
 * @param seedSet seed words for the single label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words
 * @param answerLabel name of the single label
 */
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Set<String> seedSet, boolean labelUsingSeedSets,
String answerLabel) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException,
NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
this(props, sents, seedSet, labelUsingSeedSets, PatternsAnnotations.PatternLabel1.class, answerLabel);
}
/**
 * Single-label constructor with an explicit answer class and no generalize/ignore classes.
 * Every single-label argument is wrapped into a one-entry map keyed by {@code answerLabel}
 * before the shared set-up routine is invoked.
 *
 * @param props properties used to configure the learner
 * @param sents the sentences, as a map from sentence id to its tokens
 * @param seedSet seed words for the label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words
 * @param answerClass annotation class under which the label is stored on the tokens
 * @param answerLabel name of the label
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Set<String> seedSet, boolean labelUsingSeedSets,
    Class answerClass, String answerLabel) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
    InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
  this.props = props;

  Map<String, Set<String>> seedsByLabel = new HashMap<String, Set<String>>();
  seedsByLabel.put(answerLabel, seedSet);

  Map<String, Class<? extends TypesafeMap.Key<String>>> answerClassByLabel = new HashMap<String, Class<? extends TypesafeMap.Key<String>>>();
  answerClassByLabel.put(answerLabel, answerClass);

  // No classes to ignore for this label, but the label still needs an entry.
  Map<String, Map<Class, Object>> ignoreByLabel = new HashMap<String, Map<Class, Object>>();
  ignoreByLabel.put(answerLabel, new HashMap<Class, Object>());

  Map<String, Class> noGeneralizeClasses = new HashMap<String, Class>();

  setUpConstructor(sents, seedsByLabel, labelUsingSeedSets, answerClassByLabel, noGeneralizeClasses, ignoreByLabel);
}
/**
 * Single-label constructor with generalize and ignore classes. Delegates to the constructor
 * that takes an explicit answer class, passing {@code PatternsAnnotations.PatternLabel1}
 * as that class.
 *
 * @param props properties used to configure the learner
 * @param sents the sentences, as a map from sentence id to its tokens
 * @param seedSet seed words for the single label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words
 * @param answerLabel name of the single label
 * @param generalizeClasses classes used for generalization, keyed by string
 * @param ignoreClasses classes to ignore for this label
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Set<String> seedSet, boolean labelUsingSeedSets,
String answerLabel, Map<String, Class> generalizeClasses, Map<Class, Object> ignoreClasses) throws IOException, InstantiationException,
IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
ExecutionException, ClassNotFoundException {
this(props, sents, seedSet, labelUsingSeedSets, PatternsAnnotations.PatternLabel1.class, answerLabel, generalizeClasses, ignoreClasses);
}
/**
 * Single-label constructor with an explicit answer class plus generalize and ignore classes.
 * The per-label arguments are wrapped into one-entry maps keyed by {@code answerLabel}
 * before the shared set-up routine is invoked.
 *
 * @param props properties used to configure the learner
 * @param sents the sentences, as a map from sentence id to its tokens
 * @param seedSet seed words for the label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words
 * @param answerClass annotation class under which the label is stored on the tokens
 * @param answerLabel name of the label
 * @param generalizeClasses classes used for generalization, keyed by string
 * @param ignoreClasses classes to ignore for this label
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Set<String> seedSet, boolean labelUsingSeedSets,
    Class answerClass, String answerLabel, Map<String, Class> generalizeClasses, Map<Class, Object> ignoreClasses) throws IOException,
    InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException,
    InterruptedException, ExecutionException, ClassNotFoundException {
  this.props = props;

  Map<String, Set<String>> seedsByLabel = new HashMap<String, Set<String>>();
  seedsByLabel.put(answerLabel, seedSet);

  Map<String, Class<? extends TypesafeMap.Key<String>>> answerClassByLabel = new HashMap<String, Class<? extends TypesafeMap.Key<String>>>();
  answerClassByLabel.put(answerLabel, answerClass);

  Map<String, Map<Class, Object>> ignoreByLabel = new HashMap<String, Map<Class, Object>>();
  ignoreByLabel.put(answerLabel, ignoreClasses);

  setUpConstructor(sents, seedsByLabel, labelUsingSeedSets, answerClassByLabel, generalizeClasses, ignoreByLabel);
}
/**
 * Multi-label constructor that assigns the answer classes automatically: the n-th label
 * returned by {@code seedSets.keySet()} is stored under the annotation class
 * {@code PatternsAnnotations.PatternLabel<n>} (numbering starts at 1), which is looked up by
 * name. No generalize classes are used and every label gets an empty ignore-class map.
 *
 * @param props properties used to configure the learner
 * @param sents the sentences, as a map from sentence id to its tokens
 * @param seedSets seed words for each label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words
 * @throws ClassNotFoundException if no {@code PatternLabel<n>} class exists for some label
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Map<String, Set<String>> seedSets,
    boolean labelUsingSeedSets) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
    InvocationTargetException, NoSuchMethodException, SecurityException, ClassNotFoundException, InterruptedException, ExecutionException {
  this.props = props;

  final String labelClassPrefix = "edu.stanford.nlp.patterns.surface.PatternsAnnotations$PatternLabel";
  Map<String, Class<? extends TypesafeMap.Key<String>>> answerClassByLabel = new HashMap<String, Class<? extends TypesafeMap.Key<String>>>();
  Map<String, Map<Class, Object>> ignoreByLabel = new HashMap<String, Map<Class, Object>>();

  int labelNumber = 0;
  for (String label : seedSets.keySet()) {
    labelNumber++;
    Class<?> labelClass = Class.forName(labelClassPrefix + labelNumber);
    answerClassByLabel.put(label, (Class<? extends Key<String>>) labelClass);
    ignoreByLabel.put(label, new HashMap<Class, Object>());
  }

  setUpConstructor(sents, seedSets, labelUsingSeedSets, answerClassByLabel, new HashMap<String, Class>(), ignoreByLabel);
}
/**
 * Multi-label constructor with explicit answer classes. Delegates to the full multi-label
 * constructor with empty generalize-class and ignore-class maps.
 *
 * @param props properties used to configure the learner
 * @param sents the sentences, as a map from sentence id to its tokens
 * @param seedSets seed words for each label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words
 * @param answerClass for each label, the annotation class under which it is stored on the tokens
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Map<String, Set<String>> seedSets,
boolean labelUsingSeedSets, Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass) throws IOException, InstantiationException,
IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
ExecutionException, ClassNotFoundException {
this(props, sents, seedSets, labelUsingSeedSets, answerClass, new HashMap<String, Class>(), new HashMap<String, Map<Class, Object>>());
}
/**
* generalize classes basically maps label strings to a map of generalized
* strings and the corresponding class ignoreClasses have to be boolean
*
* @throws IOException
* @throws SecurityException
* @throws NoSuchMethodException
* @throws InvocationTargetException
* @throws IllegalArgumentException
* @throws IllegalAccessException
* @throws InstantiationException
* @throws ExecutionException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map<String, List<CoreLabel>> sents, Map<String, Set<String>> seedSets,
boolean labelUsingSeedSets, Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String, Class> generalizeClasses,
Map<String, Map<Class, Object>> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
this.props = props;
if (ignoreClasses.isEmpty()) {
for (String label : seedSets.keySet())
ignoreClasses.put(label, new HashMap<Class, Object>());
}
setUpConstructor(sents, seedSets, labelUsingSeedSets, answerClass, generalizeClasses, ignoreClasses);
}
/**
 * Shared initialization used by the constructors. In order, it:
 * <ol>
 * <li>fills the option fields of {@code Data} and {@code constVars} from {@code props} and
 * configures which logging channels are hidden, based on {@code constVars.debug};</li>
 * <li>loads an inverted index from {@code constVars.loadInvertedIndexDir}, or creates a new
 * one;</li>
 * <li>adds the sentences to a newly created index and labels them with the seed words
 * (file by file when {@code constVars.batchProcessSents} is set, in which case every file is
 * written back after labeling);</li>
 * <li>optionally saves the index;</li>
 * <li>reads (learning them first if the file is missing) the per-label cluster weights when
 * {@code usePatternEvalWordClass} or {@code usePhraseEvalWordClass} is set;</li>
 * <li>computes the per-label dictionary odds weights when {@code usePatternEvalSemanticOdds}
 * or {@code usePhraseEvalSemanticOdds} is set.</li>
 * </ol>
 *
 * @param sents the sentences, as a map from sentence id to its tokens; stored in {@code Data.sents}
 * @param seedSets seed words for each label
 * @param labelUsingSeedSets whether the sentences should be labeled using the seed words; when
 *          false, labeling runs with empty seed sets
 * @param answerClass for each label, the annotation class under which it is stored on the tokens
 * @param generalizeClasses classes used for generalization, handed to {@code constVars}
 * @param ignoreClasses for each label, the classes to ignore, handed to {@code constVars}
 * @throws RuntimeException if {@code writeMatchedTokensFiles} and {@code batchProcessSents}
 *           are both set, or if a loaded index was created with a different
 *           {@code batchProcessSents} value
 */
@SuppressWarnings("rawtypes")
private void setUpConstructor(Map<String, List<CoreLabel>> sents, Map<String, Set<String>> seedSets, boolean labelUsingSeedSets,
    Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String, Class> generalizeClasses,
    Map<String, Map<Class, Object>> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
    InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
  Data.sents = sents;
  Execution.fillOptions(Data.class, props);
  Execution.fillOptions(constVars, props);
  constVars.answerClass = answerClass;
  constVars.ignoreWordswithClassesDuringSelection = ignoreClasses;
  constVars.addGeneralizeClasses(generalizeClasses);
  constVars.setLabelDictionary(seedSets);
  if (constVars.writeMatchedTokensFiles && constVars.batchProcessSents) {
    throw new RuntimeException(
        "writeMatchedTokensFiles and batchProcessSents cannot be true at the same time (not implemented; also doesn't make sense to save a large sentences json file)");
  }
  constVars.setUp(props);

  // The higher constVars.debug is, the more logging channels stay visible.
  if (constVars.debug < 1) {
    Redwood.hideChannelsEverywhere(ConstantsAndVariables.minimaldebug);
  }
  if (constVars.debug < 2) {
    Redwood.hideChannelsEverywhere(Redwood.DBG);
  }
  constVars.justify = true;
  if (constVars.debug < 3) {
    constVars.justify = false;
  }
  if (constVars.debug < 4) {
    Redwood.hideChannelsEverywhere(ConstantsAndVariables.extremedebug);
  }
  Redwood.log(Redwood.DBG, "Running with debug output");
  Redwood.log(ConstantsAndVariables.extremedebug, "Running with extreme debug output");

  wordsPatExtracted = new HashMap<String, TwoDimensionalCounter<String, SurfacePattern>>();

  // Either load an existing inverted index or prepare to create a new one.
  File invIndexDir = null;
  boolean createInvIndex = true;
  if (constVars.loadInvertedIndexDir != null) {
    createInvIndex = false;
    constVars.invertedIndex = InvertedIndexByTokens.loadIndex(constVars.loadInvertedIndexDir);
    if (constVars.invertedIndex.isBatchProcessed() != constVars.batchProcessSents) {
      throw new RuntimeException("The index was created with batchProcessSents as " + constVars.invertedIndex.isBatchProcessed()
          + ". Use the same flag or create a new index");
    }
    Redwood.log(Redwood.DBG, "Loaded index from " + constVars.loadInvertedIndexDir);
  } else if (constVars.saveInvertedIndexDir == null) {
    // Reserve a unique path in the default temp directory. The first argument of
    // createTempFile is a file-name prefix (at least three characters), not a directory.
    invIndexDir = File.createTempFile("invIndex", ".dir");
    invIndexDir.delete();
    invIndexDir.deleteOnExit();
  }

  Set<String> specialwords4Index = new HashSet<String>();
  specialwords4Index.addAll(Arrays.asList("fw", "FW", "sw", "SW", "OTHERSEM", "othersem"));
  for (String label : answerClass.keySet()) {
    wordsPatExtracted.put(label, new TwoDimensionalCounter<String, SurfacePattern>());
    specialwords4Index.add(label);
    specialwords4Index.add(label.toLowerCase());
  }

  scorePhrases = new ScorePhrases(props, constVars);
  createPats = new CreatePatterns(props, constVars);
  assert !(constVars.doNotApplyPatterns && (createPats.useStopWordsBeforeTerm || constVars.numWordsCompound > 1)) : " Cannot have both doNotApplyPatterns and (useStopWordsBeforeTerm true or numWordsCompound > 1)!";

  String prefixFileForIndex = null;
  if (constVars.usingDirForSentsInIndex) {
    prefixFileForIndex = constVars.saveSentencesSerDir;
  }
  if (createInvIndex) {
    constVars.invertedIndex = new InvertedIndexByTokens(invIndexDir, constVars.matchLowerCaseContext, constVars.getStopWords(), specialwords4Index,
        constVars.batchProcessSents, prefixFileForIndex);
  }

  // Index the sentences (for a new index) and label them with the seed words.
  int totalNumSents = 0;
  if (constVars.batchProcessSents) {
    if (createInvIndex || labelUsingSeedSets) {
      for (File f : Data.sentsFiles) {
        Map<String, List<CoreLabel>> sentsf = IOUtils.readObjectFromFile(f);
        totalNumSents += sentsf.size();
        if (createInvIndex) {
          String filename = constVars.usingDirForSentsInIndex ? f.getName() : f.getAbsolutePath();
          constVars.invertedIndex.add(sentsf, filename, constVars.useLemmaContextTokens);
        }
        Redwood.log(Redwood.DBG, "Initializing sents from " + f + " with " + sentsf.size()
            + " sentences, either by labeling with the seed set or just setting the right classes");
        for (String l : constVars.answerClass.keySet()) {
          Set<String> seed = seedSets == null || !labelUsingSeedSets ? new HashSet<String>() : (seedSets.containsKey(l) ? seedSets.get(l)
              : new HashSet<String>());
          runLabelSeedWords(sentsf, constVars.answerClass.get(l), l, seed);
          Set<String> otherseed = constVars.getOtherSemanticClasses() == null || !labelUsingSeedSets ? new HashSet<String>() : constVars
              .getOtherSemanticClasses();
          if (constVars.addIndvWordsFromPhrasesExceptLastAsNeg) {
            // Every word of a seed phrase except its last one becomes a negative example,
            // unless it is itself a seed.
            for (String s : seed) {
              String[] t = s.split("\\s+");
              for (int i = 0; i < t.length - 1; i++) {
                if (!seed.contains(t[i])) {
                  otherseed.add(t[i]);
                }
              }
            }
          }
          if (constVars.getOtherSemanticClasses() != null) {
            runLabelSeedWords(sentsf, PatternsAnnotations.OtherSemanticLabel.class, "OTHERSEM", otherseed);
          }
        }
        Redwood.log(Redwood.DBG, "Saving the labeled seed sents (if given the option) to the same file " + f);
        IOUtils.writeObjectToFile(sentsf, f);
      }
    }
  } else {
    totalNumSents = Data.sents.size();
    if (createInvIndex) {
      constVars.invertedIndex.add(Data.sents, "1", constVars.useLemmaContextTokens);
    }
    Redwood.log(Redwood.DBG, "Initializing sents " + Data.sents.size()
        + " sentences, either by labeling with the seed set or just setting the right classes");
    for (String l : constVars.answerClass.keySet()) {
      Set<String> seed = seedSets == null || !labelUsingSeedSets ? new HashSet<String>() : (seedSets.containsKey(l) ? seedSets.get(l)
          : new HashSet<String>());
      runLabelSeedWords(Data.sents, constVars.answerClass.get(l), l, seed);
      Set<String> otherseed = constVars.getOtherSemanticClasses() == null || !labelUsingSeedSets ? new HashSet<String>() : constVars
          .getOtherSemanticClasses();
      if (constVars.getOtherSemanticClasses() != null) {
        runLabelSeedWords(Data.sents, PatternsAnnotations.OtherSemanticLabel.class, "OTHERSEM", otherseed);
      }
    }
  }

  if (constVars.saveInvertedIndexDir != null) {
    IOUtils.ensureDir(new File(constVars.saveInvertedIndexDir));
    constVars.invertedIndex.saveIndex(constVars.saveInvertedIndexDir);
  }
  Redwood.log(Redwood.DBG, "Done creating inverted index of " + constVars.invertedIndex.size() + " tokens and labeling data with total of "
      + totalNumSents + " sentences");

  // Per-label cluster weights, read from "<externalFeatureWeightsFile>_<label>".
  if (constVars.usePatternEvalWordClass || constVars.usePhraseEvalWordClass) {
    if (constVars.externalFeatureWeightsFile == null) {
      // No file configured: reserve a temp path; the per-label files are learned below.
      File f = File.createTempFile("tempfeat", ".txt");
      f.delete();
      f.deleteOnExit();
      constVars.externalFeatureWeightsFile = f.getAbsolutePath();
    }
    for (String label : seedSets.keySet()) {
      String externalFeatureWeightsFileLabel = constVars.externalFeatureWeightsFile + "_" + label;
      File f = new File(externalFeatureWeightsFileLabel);
      if (!f.exists()) {
        Redwood.log(Redwood.DBG, "externalweightsfile for the label " + label + " does not exist: learning weights!");
        LearnImportantFeatures lmf = new LearnImportantFeatures();
        Execution.fillOptions(lmf, props);
        lmf.answerClass = answerClass.get(label);
        lmf.answerLabel = label;
        lmf.setUp();
        lmf.getTopFeatures(constVars.batchProcessSents, Data.sentsFiles, Data.sents, constVars.perSelectRand, constVars.perSelectNeg,
            externalFeatureWeightsFileLabel);
      }
      Counter<Integer> distSimWeightsLabel = new ClassicCounter<Integer>();
      // Only lines of the form "Cluster-<number>:<weight>" are used.
      for (String line : IOUtils.readLines(externalFeatureWeightsFileLabel)) {
        String[] t = line.split(":");
        if (!t[0].startsWith("Cluster"))
          continue;
        String s = t[0].replace("Cluster-", "");
        Integer clusterNum = Integer.parseInt(s);
        distSimWeightsLabel.setCount(clusterNum, Double.parseDouble(t[1]));
      }
      constVars.distSimWeights.put(label, distSimWeightsLabel);
    }
  }

  // computing semantic odds values
  if (constVars.usePatternEvalSemanticOdds || constVars.usePhraseEvalSemanticOdds) {
    Counter<String> dictOddsWeightsLabel = new ClassicCounter<String>();
    Counter<String> otherSemanticClassFreq = new ClassicCounter<String>();
    // The other semantic classes may be absent (the labeling code above checks for null as
    // well); in that case they simply contribute no n-grams.
    Set<String> otherSemanticClasses = constVars.getOtherSemanticClasses();
    if (otherSemanticClasses != null) {
      for (String s : otherSemanticClasses) {
        for (String s1 : StringUtils.getNgrams(Arrays.asList(s.split("\\s+")), 1, constVars.numWordsCompound))
          otherSemanticClassFreq.incrementCount(s1);
      }
    }
    otherSemanticClassFreq = Counters.add(otherSemanticClassFreq, 1.0);

    // N-gram counts of each label's seed phrases, with 1.0 added to every count.
    Map<String, Counter<String>> labelDictNgram = new HashMap<String, Counter<String>>();
    for (String label : seedSets.keySet()) {
      Counter<String> classFreq = new ClassicCounter<String>();
      for (String s : seedSets.get(label)) {
        for (String s1 : StringUtils.getNgrams(Arrays.asList(s.split("\\s+")), 1, constVars.numWordsCompound))
          classFreq.incrementCount(s1);
      }
      classFreq = Counters.add(classFreq, 1.0);
      labelDictNgram.put(label, classFreq);
    }

    // Odds of an n-gram for a label: its count for the label divided by its count in all
    // other labels plus the other semantic classes.
    for (String label : seedSets.keySet()) {
      Counter<String> otherLabelFreq = new ClassicCounter<String>();
      for (String label2 : seedSets.keySet()) {
        if (label.equals(label2))
          continue;
        otherLabelFreq.addAll(labelDictNgram.get(label2));
      }
      otherLabelFreq.addAll(otherSemanticClassFreq);
      dictOddsWeightsLabel = Counters.divisionNonNaN(labelDictNgram.get(label), otherLabelFreq);
      constVars.dictOddsWeights.put(label, dictOddsWeightsLabel);
    }
  }
}
/**
 * Runs a freshly built annotation pipeline over sentences that are already tokenized and
 * returns their tokens keyed by sentence id.
 * <p>
 * The pipeline always contains the "pos" and "lemma" annotators; "parse" and "ner" are added
 * when the corresponding restriction flag is set. When parsing is enabled, every sentence's
 * tree is passed through {@link #inferParentParseTag(Tree)}.
 *
 * @param sentsCM the sentences to annotate
 * @param posModelPath path of the POS model to use, or null to leave the property unset
 * @param useTargetNERRestriction whether to run the "ner" annotator
 * @param prefix prepended to each sentence's document id to form its key in the result
 * @param useTargetParserParentRestriction whether to run the "parse" annotator
 * @param numThreads value for the "nthreads" and "threads" pipeline properties
 * @return map from {@code prefix + <document id of the sentence>} to the sentence's tokens
 */
public static Map<String, List<CoreLabel>> runPOSNEROnTokens(List<CoreMap> sentsCM, String posModelPath, boolean useTargetNERRestriction,
    String prefix, boolean useTargetParserParentRestriction, String numThreads) {
  Annotation document = new Annotation(sentsCM);

  List<String> annotators = new ArrayList<String>(Arrays.asList("pos", "lemma"));
  if (useTargetParserParentRestriction) {
    annotators.add("parse");
  }
  if (useTargetNERRestriction) {
    annotators.add("ner");
  }

  Properties pipelineProps = new Properties();
  pipelineProps.setProperty("annotators", StringUtils.join(annotators, ","));
  pipelineProps.setProperty("parse.maxlen", "80");
  pipelineProps.setProperty("nthreads", numThreads);
  pipelineProps.setProperty("threads", numThreads);
  if (posModelPath != null) {
    pipelineProps.setProperty("pos.model", posModelPath);
  }

  StanfordCoreNLP tokenPipeline = new StanfordCoreNLP(pipelineProps, false);
  Redwood.log(Redwood.DBG, "Annotating text");
  tokenPipeline.annotate(document);
  Redwood.log(Redwood.DBG, "Done annotating text");

  Map<String, List<CoreLabel>> tokensBySentId = new HashMap<String, List<CoreLabel>>();
  for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
    if (useTargetParserParentRestriction) {
      inferParentParseTag(sentence.get(TreeAnnotation.class));
    }
    String sentId = prefix + sentence.get(CoreAnnotations.DocIDAnnotation.class);
    tokensBySentId.put(sentId, sentence.get(CoreAnnotations.TokensAnnotation.class));
  }
  return tokensBySentId;
}
// Pipeline shared by all calls to tokenize; created on the first call, using that call's options.
static StanfordCoreNLP pipeline = null;
/**
 * Tokenizes and annotates {@code text} and adds the tokens of every sentence to
 * {@code sents}, keyed by {@code sentIDPrefix + <index of the sentence in this text>}.
 * <p>
 * The annotation pipeline is created on the first call and then reused, so the annotator
 * options ({@code posModelPath}, the two restriction flags and {@code numThreads}) of later
 * calls have no effect.
 * <p>
 * In batch mode, whenever {@code sents} reaches {@code numMaxSentencesPerBatchFile} entries
 * it is serialized to {@code <saveSentencesSerDirFile>/sents_<n>}, the file is registered in
 * {@code Data.sentsFiles} and the map is emptied. Remaining sentences are flushed the same
 * way before returning, so in batch mode {@code sents} is always empty on return.
 *
 * @param text the raw text
 * @param posModelPath path of the POS model to use, or null to leave the property unset
 * @param lowercase whether to lower-case the text before annotating it
 * @param useTargetNERRestriction whether to run the "ner" annotator
 * @param sentIDPrefix prefix of the sentence ids
 * @param useTargetParserParentRestriction whether to run the "parse" annotator
 * @param numThreads value for the "threads" pipeline property
 * @param batchProcessSents whether sentences are written to batch files instead of being
 *          kept in {@code sents}
 * @param numMaxSentencesPerBatchFile maximum number of sentences per batch file
 * @param saveSentencesSerDirFile directory for the batch files
 * @param sents receives the sentences; must be modifiable
 * @param numFilesTillNow number of batch files written before this call
 * @return the number of batch files written so far, including those written by this call
 */
public static int tokenize(String text, String posModelPath, boolean lowercase, boolean useTargetNERRestriction, String sentIDPrefix,
    boolean useTargetParserParentRestriction, String numThreads, boolean batchProcessSents, int numMaxSentencesPerBatchFile,
    File saveSentencesSerDirFile, Map<String, List<CoreLabel>> sents, int numFilesTillNow) throws InterruptedException, ExecutionException,
    IOException {
  if (pipeline == null) {
    Properties props = new Properties();
    List<String> anns = new ArrayList<String>();
    anns.add("tokenize");
    anns.add("ssplit");
    anns.add("pos");
    anns.add("lemma");
    if (useTargetParserParentRestriction) {
      anns.add("parse");
    }
    if (useTargetNERRestriction) {
      anns.add("ner");
    }
    props.setProperty("annotators", StringUtils.join(anns, ","));
    props.setProperty("parse.maxlen", "80");
    props.setProperty("threads", numThreads);
    props.put("tokenize.options", "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
    if (posModelPath != null) {
      props.setProperty("pos.model", posModelPath);
    }
    pipeline = new StanfordCoreNLP(props);
  }

  if (lowercase)
    text = text.toLowerCase();
  Annotation doc = new Annotation(text);
  pipeline.annotate(doc);
  Redwood.log(Redwood.DBG, "Done annotating text");

  int i = -1;
  for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
    i++;
    if (useTargetParserParentRestriction)
      inferParentParseTag(s.get(TreeAnnotation.class));
    sents.put(sentIDPrefix + i, s.get(CoreAnnotations.TokensAnnotation.class));
    if (batchProcessSents && sents.size() >= numMaxSentencesPerBatchFile) {
      numFilesTillNow++;
      File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow);
      IOUtils.writeObjectToFile(sents, file);
      Data.sentsFiles.add(file);
      // Empty the caller's map in place. Rebinding the parameter to a new map would leave
      // the sentences that were just written in the caller's map, so they would be kept in
      // memory and written again by a later call.
      sents.clear();
    }
  }

  // Flush what is left so that no sentences are carried around when batch processing.
  if (batchProcessSents && sents.size() > 0) {
    numFilesTillNow++;
    File file = new File(saveSentencesSerDirFile + "/sents_" + numFilesTillNow);
    IOUtils.writeObjectToFile(sents, file);
    Data.sentsFiles.add(file);
    sents.clear();
  }
  return numFilesTillNow;
}
/**
 * Walks the tree recursively and, for every leaf, stores the value of the leaf's grandparent
 * node in the leaf label's {@code GrandparentAnnotation}.
 *
 * @param tree root of the (sub)tree to process; its leaves must be labeled with {@code CoreLabel}s
 */
static void inferParentParseTag(Tree tree) {
  final String ancestorTag = tree.value();
  for (Tree child : tree.children()) {
    for (Tree grandChild : child.children()) {
      if (!grandChild.isLeaf()) {
        continue;
      }
      CoreLabel leafLabel = (CoreLabel) grandChild.label();
      leafLabel.set(CoreAnnotations.GrandparentAnnotation.class, ancestorTag);
    }
    inferParentParseTag(child);
  }
}
/**
 * Finds all occurrences of the token sequence {@code l1} inside {@code l2}.
 * <p>
 * A token of {@code l1} matches position {@code i} if it matches {@code l2[i]} or
 * {@code subl2[i]}. Matching is exact, except that a fuzzy comparison (edit-distance score
 * of at most 1) is used when neither {@code l2[i]} nor {@code subl2[i]} is contained in
 * {@code englishWords} and all of {@code l1[j]}, {@code l2[i]} and {@code subl2[i]} are
 * longer than {@code minLen4Fuzzy}. After a complete match the search continues behind it.
 *
 * @param l1 the token sequence to look for
 * @param l2 the token sequence to search in
 * @param subl2 alternative form of every token of {@code l2} (the caller in this file passes
 *          the lemmas); must be at least as long as {@code l2}
 * @param englishWords words for which fuzzy matching is disabled
 * @param seenFuzzyMatches cache of token pairs ("a#b") that already matched fuzzily; new
 *          fuzzy matches are added to it
 * @param minLen4Fuzzy tokens of at most this length are only matched exactly
 * @return the start indices in {@code l2} of all matches (empty if there is none, or if
 *         {@code l1} is empty), or null if {@code l1} is longer than {@code l2}
 */
public static List<Integer> getSubListIndex(String[] l1, String[] l2, String[] subl2, Set<String> englishWords, HashSet<String> seenFuzzyMatches,
    int minLen4Fuzzy) {
  if (l1.length > l2.length)
    return null;
  // An empty phrase (e.g. from a whitespace-only seed entry) matches nothing. Without this
  // guard the scan below never advances and loops forever.
  if (l1.length == 0)
    return new ArrayList<Integer>();
  EditDistance editDistance = new EditDistance(true);
  List<Integer> allIndices = new ArrayList<Integer>();
  boolean matched = false;
  // position of the most recently matched token, or -1
  int index = -1;
  int lastUnmatchedIndex = 0;
  for (int i = 0; i < l2.length;) {
    for (int j = 0; j < l1.length;) {
      // d1: l1[j] matches l2[i]; d2: l1[j] matches subl2[i]
      boolean d1 = false, d2 = false;
      boolean compareFuzzy = true;
      if (englishWords.contains(l2[i]) || englishWords.contains(subl2[i]) || l2[i].length() <= minLen4Fuzzy || subl2[i].length() <= minLen4Fuzzy)
        compareFuzzy = false;
      if (!compareFuzzy || l1[j].length() <= minLen4Fuzzy) {
        d1 = l1[j].equals(l2[i]);
        if (!d1)
          d2 = subl2[i].equals(l1[j]);
      } else {
        String combo = l1[j] + "#" + l2[i];
        if (l1[j].equals(l2[i]) || seenFuzzyMatches.contains(combo))
          d1 = true;
        else {
          d1 = editDistance.score(l1[j], l2[i]) <= 1;
          if (!d1) {
            String combo2 = l1[j] + "#" + subl2[i];
            if (l1[j].equals(subl2[i]) || seenFuzzyMatches.contains(combo2))
              d2 = true;
            else {
              d2 = editDistance.score(l1[j], subl2[i]) <= 1;
              if (d2) {
                seenFuzzyMatches.add(combo2);
              }
            }
          } else {
            seenFuzzyMatches.add(combo);
          }
        }
      }
      if (d1 || d2) {
        index = i;
        i++;
        j++;
        if (j == l1.length) {
          matched = true;
          break;
        }
      } else {
        // Mismatch: restart the phrase one position after the previous restart point.
        j = 0;
        i = lastUnmatchedIndex + 1;
        lastUnmatchedIndex = i;
        index = -1;
        if (lastUnmatchedIndex == l2.length)
          break;
      }
      if (i >= l2.length) {
        // Ran out of tokens in the middle of a partial match.
        index = -1;
        break;
      }
    }
    if (i == l2.length || matched) {
      if (index >= 0)
        // index is the position of the last matched token; convert it to the start position
        allIndices.add(index - l1.length + 1);
      matched = false;
      lastUnmatchedIndex = index;
    }
  }
  return allIndices;
}
/**
 * Labels the tokens of all sentences in {@code sents} with {@code label} wherever a seed
 * phrase matches, using up to {@code constVars.numThreads} threads.
 * <p>
 * The sentence ids are split into contiguous chunks of {@code ceil(size / numThreads)} ids, so
 * that every sentence is assigned to exactly one {@link LabelWithSeedWords} task. The results
 * are merged back into {@code sents} only after all tasks have finished, so the map is never
 * written while a task may still be reading it.
 *
 * @param sents the sentences to label, as a map from sentence id to its tokens
 * @param answerclass annotation class under which the label is stored on the tokens
 * @param label the label to assign to matching tokens
 * @param seedWords the seed phrases to look for
 * @throws RuntimeException wrapping the cause if a labeling task fails or the wait for it is
 *           interrupted
 */
public void runLabelSeedWords(Map<String, List<CoreLabel>> sents, Class answerclass, String label, Set<String> seedWords)
    throws InterruptedException, ExecutionException, IOException {
  List<String> keyset = new ArrayList<String>(sents.keySet());
  int numKeys = keyset.size();
  // Guard against a non-positive thread count.
  int numThreads = Math.max(1, constVars.numThreads);
  // Ceiling division: numThreads chunks of this size always cover all keys.
  int num = (numKeys + numThreads - 1) / numThreads;

  ExecutorService executor = Executors.newFixedThreadPool(numThreads);
  try {
    Redwood.log(ConstantsAndVariables.extremedebug, "keyset size is " + numKeys);
    List<Future<Map<String, List<CoreLabel>>>> list = new ArrayList<Future<Map<String, List<CoreLabel>>>>();
    for (int from = 0; from < numKeys; from += num) {
      int to = Math.min(numKeys, from + num);
      List<String> keys = keyset.subList(from, to);
      Redwood.log(ConstantsAndVariables.extremedebug, "assigning from " + from + " till " + to);
      Callable<Map<String, List<CoreLabel>>> task = new LabelWithSeedWords(seedWords, sents, keys, answerclass, label);
      list.add(executor.submit(task));
    }

    // Wait for every task before touching sents.
    List<Map<String, List<CoreLabel>>> results = new ArrayList<Map<String, List<CoreLabel>>>();
    for (Future<Map<String, List<CoreLabel>>> future : list) {
      try {
        results.add(future.get());
      } catch (Exception e) {
        executor.shutdownNow();
        if (e instanceof InterruptedException) {
          // keep the interrupt visible to the caller
          Thread.currentThread().interrupt();
        }
        throw new RuntimeException(e);
      }
    }
    for (Map<String, List<CoreLabel>> result : results) {
      sents.putAll(result);
    }
  } finally {
    executor.shutdown();
  }
}
/**
 * Task that labels the tokens of a subset of the sentences with one label, based on a set of
 * seed phrases.
 */
@SuppressWarnings("rawtypes")
public class LabelWithSeedWords implements Callable<Map<String, List<CoreLabel>>> {
  /** The seed phrases, each split into tokens on whitespace. */
  Set<String[]> seedwordsTokens = new HashSet<String[]>();
  /** All sentences; only those whose id is in {@link #keyset} are processed. */
  Map<String, List<CoreLabel>> sents;
  /** Ids of the sentences this task is responsible for. */
  List<String> keyset;
  /** Annotation class under which the label is stored on each token. */
  Class labelClass;
  /** Cache of fuzzy matches, handed to {@code getSubListIndex}. */
  HashSet<String> seenFuzzyMatches = new HashSet<String>();
  /** The label assigned to matching tokens. */
  String label;

  /**
   * @param seedwords the seed phrases; each one is split into tokens on whitespace
   * @param sents all sentences, as a map from sentence id to its tokens
   * @param keyset ids of the sentences to label
   * @param labelclass annotation class under which the label is stored
   * @param label the label to assign
   */
  public LabelWithSeedWords(Set<String> seedwords, Map<String, List<CoreLabel>> sents, List<String> keyset, Class labelclass, String label) {
    for (String seedPhrase : seedwords) {
      this.seedwordsTokens.add(seedPhrase.split("\\s+"));
    }
    this.sents = sents;
    this.keyset = keyset;
    this.labelClass = labelclass;
    this.label = label;
  }

  /**
   * Labels every token of the assigned sentences: tokens covered by a seed phrase (matched
   * against the words or the lemmas) get {@link #label}, all others get
   * {@code constVars.backgroundSymbol}. The matching phrases are also added to each token's
   * {@code MatchedPhrases} annotation. The token objects are modified in place.
   *
   * @return the processed sentences, keyed by sentence id
   * @throws RuntimeException if a token has no lemma
   */
  @SuppressWarnings("unchecked")
  @Override
  public Map<String, List<CoreLabel>> call() throws Exception {
    Map<String, List<CoreLabel>> labeledSents = new HashMap<String, List<CoreLabel>>();
    for (String sentId : keyset) {
      List<CoreLabel> sent = sents.get(sentId);

      // Collect the words and lemmas of the sentence.
      String[] words = new String[sent.size()];
      String[] lemmas = new String[sent.size()];
      int pos = 0;
      for (CoreLabel token : sent) {
        words[pos] = token.word();
        if (token.lemma() == null) {
          throw new RuntimeException("how come lemma is null");
        }
        lemmas[pos] = token.lemma();
        pos++;
      }

      // Mark every position covered by some seed phrase and remember which phrases cover it.
      boolean[] isLabeled = new boolean[words.length];
      CollectionValuedMap<Integer, String> matchedPhrases = new CollectionValuedMap<Integer, String>();
      for (String[] seedPhrase : seedwordsTokens) {
        List<Integer> starts = getSubListIndex(seedPhrase, words, lemmas, constVars.getEnglishWords(), seenFuzzyMatches,
            constVars.minLen4FuzzyForPattern);
        if (starts == null || starts.isEmpty()) {
          continue;
        }
        for (int start : starts) {
          for (int offset = 0; offset < seedPhrase.length; offset++) {
            matchedPhrases.add(start + offset, StringUtils.join(seedPhrase, " "));
            isLabeled[start + offset] = true;
          }
        }
      }

      // Write the labels onto the tokens.
      int tokenIndex = 0;
      for (CoreLabel token : sent) {
        if (!isLabeled[tokenIndex]) {
          token.set(labelClass, constVars.backgroundSymbol);
        } else {
          token.set(labelClass, label);
          Redwood.log(ConstantsAndVariables.extremedebug, "labeling " + token.word() + " or its lemma " + token.lemma() + " as " + label
              + " because of the dict phrases " + (Set<String>) matchedPhrases.get(tokenIndex));
        }
        if (!token.containsKey(PatternsAnnotations.MatchedPhrases.class)) {
          token.set(PatternsAnnotations.MatchedPhrases.class, new HashSet<String>());
        }
        token.get(PatternsAnnotations.MatchedPhrases.class).addAll(matchedPhrases.get(tokenIndex));
        tokenIndex++;
      }
      labeledSents.put(sentId, sent);
    }
    return labeledSents;
  }
}
// Per label: pattern -> phrase counts for positively labeled tokens only; lazily created and filled by getPatterns.
public Map<String, TwoDimensionalCounter<SurfacePattern, String>> patternsandWords = null;
// Per label: pattern -> phrase counts over all tokens (positive, negative and unlabeled); lazily created and filled by getPatterns.
public Map<String, TwoDimensionalCounter<SurfacePattern, String>> allPatternsandWords = null;
// Per label: the pattern scores computed by the most recent getPatterns call (after ignore/already-identified filtering).
public Map<String, Counter<SurfacePattern>> currentPatternWeights = null;
/**
 * Scores the candidate surface patterns for {@code label} and selects the patterns to add in this
 * iteration.
 *
 * Steps:
 * <ol>
 * <li>Obtain the patterns around every token (computed via createPats, or read from
 * constVars.allPatternsFile) and accumulate the pattern/phrase sufficient statistics, either over
 * Data.sents or file by file over Data.sentsFiles when constVars.batchProcessSents is set.</li>
 * <li>Drop patterns that fail the minimum positive/unlabeled support requirements.</li>
 * <li>Score the remaining patterns with the scorer selected by constVars.patternScoring.</li>
 * <li>Walk the patterns from highest to lowest score and pick at most constVars.numPatterns of them
 * whose score is at least constVars.thresholdSelectPattern, skipping patterns without unlabeled
 * support and resolving overlaps with previously learned patterns and with patterns already picked
 * in this call.</li>
 * <li>Optionally append the selection to {@code <outDir>/<identifier>/<label>/patterns.json} and log
 * a justification.</li>
 * </ol>
 *
 * Side effects: may fill {@code patternsForEachToken}, may write constVars.allPatternsFile, removes
 * superseded patterns through removeLearnedPatterns, and stores the statistics and scores for the
 * label in {@code allPatternsandWords}, {@code patternsandWords} and {@code currentPatternWeights}.
 * For F1SeedPattern scoring the method returns right after scoring: only the single best pattern is
 * returned and none of the selection, output or bookkeeping steps after it run.
 *
 * @param label label for which patterns are learned
 * @param alreadyIdentifiedPatterns patterns learned in earlier iterations; may be null
 * @param p0 seed pattern, used only by F1SeedPattern scoring
 * @param p0Set phrases associated with the seed pattern, used only by F1SeedPattern scoring
 * @param ignorePatterns patterns that must never be selected; may be null
 * @return the selected patterns with their scores
 * @throws RuntimeException if the scoring measure is not implemented, the kNN scorer cannot be
 *           instantiated, or the best F1 score is NaN
 */
@SuppressWarnings({ "unchecked" })
public Counter<SurfacePattern> getPatterns(String label, Set<SurfacePattern> alreadyIdentifiedPatterns, SurfacePattern p0, Counter<String> p0Set,
    Set<SurfacePattern> ignorePatterns) throws InterruptedException, ExecutionException, IOException, ClassNotFoundException,
    InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException {
  TwoDimensionalCounter<SurfacePattern, String> patternsandWords4Label = new TwoDimensionalCounter<SurfacePattern, String>();
  TwoDimensionalCounter<SurfacePattern, String> negPatternsandWords4Label = new TwoDimensionalCounter<SurfacePattern, String>();
  TwoDimensionalCounter<SurfacePattern, String> posnegPatternsandWords4Label = new TwoDimensionalCounter<SurfacePattern, String>();
  TwoDimensionalCounter<SurfacePattern, String> unLabeledPatternsandWords4Label = new TwoDimensionalCounter<SurfacePattern, String>();
  TwoDimensionalCounter<SurfacePattern, String> negandUnLabeledPatternsandWords4Label = new TwoDimensionalCounter<SurfacePattern, String>();
  TwoDimensionalCounter<SurfacePattern, String> allPatternsandWords4Label = new TwoDimensionalCounter<SurfacePattern, String>();

  if (!constVars.batchProcessSents) {
    // all sentences are in memory: compute (or load) the patterns once, then gather statistics
    if (this.patternsForEachToken == null) {
      if (constVars.computeAllPatterns) {
        Redwood.log(Redwood.DBG, "Computing all patterns");
        this.patternsForEachToken = createPats.getAllPatterns(label, Data.sents);
        constVars.computeAllPatterns = false;
      } else {
        // read from the saved file
        this.patternsForEachToken = IOUtils.readObjectFromFile(constVars.allPatternsFile);
        Redwood.log(ConstantsAndVariables.minimaldebug, "Read all patterns from " + constVars.allPatternsFile);
      }
    }
    this.calculateSufficientStats(Data.sents, patternsForEachToken, label, patternsandWords4Label, posnegPatternsandWords4Label,
        allPatternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, negandUnLabeledPatternsandWords4Label);
  } else {
    // batch processing: sentences are read from one serialized file at a time
    for (File f : Data.sentsFiles) {
      Redwood.log(Redwood.DBG, (constVars.computeAllPatterns ? "Creating patterns and " : "") + "calculating sufficient statistics from " + f);
      Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
      Map<String, Map<Integer, Triple<Set<SurfacePattern>, Set<SurfacePattern>, Set<SurfacePattern>>>> pats4File = null;
      if (constVars.computeAllPatterns) {
        if (this.patternsForEachToken == null)
          this.patternsForEachToken = new HashMap<String, Map<Integer, Triple<Set<SurfacePattern>, Set<SurfacePattern>, Set<SurfacePattern>>>>();
        pats4File = createPats.getAllPatterns(label, sents);
        this.patternsForEachToken.putAll(pats4File);
      } else {
        if (this.patternsForEachToken == null) {
          // read only for the first time
          this.patternsForEachToken = IOUtils.readObjectFromFile(constVars.allPatternsFile);
          Redwood.log(ConstantsAndVariables.minimaldebug, "Read all patterns from " + constVars.allPatternsFile);
        }
        pats4File = this.patternsForEachToken;
      }
      this.calculateSufficientStats(sents, pats4File, label, patternsandWords4Label, posnegPatternsandWords4Label, allPatternsandWords4Label,
          negPatternsandWords4Label, unLabeledPatternsandWords4Label, negandUnLabeledPatternsandWords4Label);
    }
  }

  // NOTE(review): in the non-batch path computeAllPatterns has already been reset to false above,
  // so this write only happens in batch mode - confirm that is intended.
  if (constVars.computeAllPatterns && constVars.allPatternsFile != null) {
    IOUtils.writeObjectToFile(this.patternsForEachToken, constVars.allPatternsFile);
  }

  if (patternsandWords == null)
    patternsandWords = new HashMap<String, TwoDimensionalCounter<SurfacePattern, String>>();
  if (allPatternsandWords == null)
    allPatternsandWords = new HashMap<String, TwoDimensionalCounter<SurfacePattern, String>>();
  if (currentPatternWeights == null)
    currentPatternWeights = new HashMap<String, Counter<SurfacePattern>>();

  Counter<SurfacePattern> currentPatternWeights4Label = new ClassicCounter<SurfacePattern>();

  // patterns without enough support are removed from every statistic before scoring
  Set<SurfacePattern> removePats = enforceMinSupportRequirements(patternsandWords4Label, unLabeledPatternsandWords4Label);
  Counters.removeKeys(patternsandWords4Label, removePats);
  Counters.removeKeys(unLabeledPatternsandWords4Label, removePats);
  Counters.removeKeys(negandUnLabeledPatternsandWords4Label, removePats);
  Counters.removeKeys(allPatternsandWords4Label, removePats);
  Counters.removeKeys(posnegPatternsandWords4Label, removePats);
  Counters.removeKeys(negPatternsandWords4Label, removePats);

  ScorePatterns scorePatterns;
  Class<?> patternscoringclass = getPatternScoringClass(constVars.patternScoring);
  if (patternscoringclass != null && patternscoringclass.equals(ScorePatternsF1.class)) {
    scorePatterns = new ScorePatternsF1(constVars, constVars.patternScoring, label, patternsandWords4Label, negPatternsandWords4Label,
        unLabeledPatternsandWords4Label, negandUnLabeledPatternsandWords4Label, allPatternsandWords4Label, props, p0Set, p0);
    Counter<SurfacePattern> finalPat = scorePatterns.score();
    // alreadyIdentifiedPatterns is allowed to be null (the general path below guards it too)
    if (alreadyIdentifiedPatterns != null)
      Counters.removeKeys(finalPat, alreadyIdentifiedPatterns);
    Counters.retainNonZeros(finalPat);
    Counters.retainTop(finalPat, 1);
    if (Double.isNaN(Counters.max(finalPat)))
      throw new RuntimeException("how is the value NaN");
    Redwood.log(ConstantsAndVariables.minimaldebug, "Selected Pattern: " + finalPat);
    return finalPat;
  } else if (patternscoringclass != null && patternscoringclass.equals(ScorePatternsRatioModifiedFreq.class)) {
    scorePatterns = new ScorePatternsRatioModifiedFreq(constVars, constVars.patternScoring, label, patternsandWords4Label,
        negPatternsandWords4Label, unLabeledPatternsandWords4Label, negandUnLabeledPatternsandWords4Label, allPatternsandWords4Label,
        phInPatScoresCache, scorePhrases, props);
  } else if (patternscoringclass != null && patternscoringclass.equals(ScorePatternsFreqBased.class)) {
    scorePatterns = new ScorePatternsFreqBased(constVars, constVars.patternScoring, label, patternsandWords4Label, negPatternsandWords4Label,
        unLabeledPatternsandWords4Label, negandUnLabeledPatternsandWords4Label, allPatternsandWords4Label, props);
  } else if (constVars.patternScoring.equals(PatternScoring.kNN)) {
    // the kNN scorer is looked up reflectively because the class may not be on the classpath
    try {
      Class<? extends ScorePatterns> clazz = (Class<? extends ScorePatterns>) Class.forName("edu.stanford.nlp.patterns.surface.ScorePatternsKNN");
      Constructor<? extends ScorePatterns> ctor = clazz.getConstructor(ConstantsAndVariables.class, PatternScoring.class, String.class,
          TwoDimensionalCounter.class, TwoDimensionalCounter.class, TwoDimensionalCounter.class, TwoDimensionalCounter.class,
          TwoDimensionalCounter.class, ScorePhrases.class, Properties.class);
      scorePatterns = ctor.newInstance(constVars, constVars.patternScoring, label, patternsandWords4Label, negPatternsandWords4Label,
          unLabeledPatternsandWords4Label, negandUnLabeledPatternsandWords4Label, allPatternsandWords4Label, scorePhrases, props);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException("kNN pattern scoring is not released yet. Stay tuned.", e);
    } catch (NoSuchMethodException e) {
      throw new RuntimeException("newinstance of kNN not created", e);
    } catch (InvocationTargetException e) {
      throw new RuntimeException("newinstance of kNN not created", e);
    } catch (IllegalAccessException e) {
      throw new RuntimeException("newinstance of kNN not created", e);
    } catch (InstantiationException e) {
      throw new RuntimeException("newinstance of kNN not created", e);
    }
  } else {
    throw new RuntimeException(constVars.patternScoring + " is not implemented (check spelling?). ");
  }

  scorePatterns.setUp(props);
  currentPatternWeights4Label = scorePatterns.score();
  Redwood.log(ConstantsAndVariables.extremedebug, "patterns counter size is " + currentPatternWeights4Label.size());

  if (ignorePatterns != null && !ignorePatterns.isEmpty()) {
    Counters.removeKeys(currentPatternWeights4Label, ignorePatterns);
    Redwood.log(ConstantsAndVariables.extremedebug, "Removing patterns from ignorePatterns of size " + ignorePatterns.size()
        + ". New patterns size " + currentPatternWeights4Label.size());
  }
  if (alreadyIdentifiedPatterns != null && !alreadyIdentifiedPatterns.isEmpty()) {
    Counters.removeKeys(currentPatternWeights4Label, alreadyIdentifiedPatterns);
    Redwood.log(ConstantsAndVariables.extremedebug, "Removing already identified patterns of size " + alreadyIdentifiedPatterns.size()
        + ". New patterns size " + currentPatternWeights4Label.size());
  }

  PriorityQueue<SurfacePattern> q = Counters.toPriorityQueue(currentPatternWeights4Label);
  // num mirrors chosenPat.size(); it bounds the number of patterns selected in this call
  int num = 0;
  Counter<SurfacePattern> chosenPat = new ClassicCounter<SurfacePattern>();
  // previously learned patterns that are superseded by a pattern chosen in this call
  Set<SurfacePattern> removePatterns = new HashSet<SurfacePattern>();

  while (num < constVars.numPatterns && !q.isEmpty()) {
    SurfacePattern pat = q.removeFirst();
    if (currentPatternWeights4Label.getCount(pat) < constVars.thresholdSelectPattern) {
      // the queue is ordered by score, so no later candidate can reach the threshold either
      Redwood.log(Redwood.DBG, "The max weight of candidate patterns is " + df.format(currentPatternWeights4Label.getCount(pat))
          + " so not adding anymore patterns");
      break;
    }
    if (!unLabeledPatternsandWords4Label.containsFirstKey(pat) || unLabeledPatternsandWords4Label.getCounter(pat).isEmpty()) {
      Redwood.log(ConstantsAndVariables.extremedebug, "Removing pattern " + pat + " because it has no unlab support; pos words: "
          + patternsandWords4Label.getCounter(pat) + " and all words " + allPatternsandWords4Label.getCounter(pat));
      continue;
    }

    boolean notchoose = false;
    // Both removal sets belong to this candidate only: they are applied only if the candidate is
    // actually chosen, and must not leak into the decision for a later candidate.
    Set<SurfacePattern> removeChosenPats = null;
    Set<SurfacePattern> removeIdentifiedPatterns = null;

    // compare against the patterns learned in earlier iterations
    if (alreadyIdentifiedPatterns != null) {
      for (SurfacePattern p : alreadyIdentifiedPatterns) {
        if (SurfacePattern.subsumes(pat, p)) {
          Redwood.log(ConstantsAndVariables.extremedebug, "Not choosing pattern " + pat
              + " because it is contained in or contains the already chosen pattern " + p);
          notchoose = true;
          break;
        }
        int rest = pat.equalContext(p);
        // the contexts dont match
        if (rest == Integer.MAX_VALUE)
          continue;
        // if pat is less restrictive, remove p and add pat!
        if (rest < 0) {
          if (removeIdentifiedPatterns == null)
            removeIdentifiedPatterns = new HashSet<SurfacePattern>();
          removeIdentifiedPatterns.add(p);
        } else {
          notchoose = true;
          break;
        }
      }
    }

    // compare against the patterns already chosen in this call
    if (!notchoose) {
      for (SurfacePattern p : chosenPat.keySet()) {
        boolean removeChosenPatFlag = false;
        if (SurfacePattern.sameGenre(pat, p)) {
          if (SurfacePattern.subsumes(pat, p)) {
            Redwood.log(ConstantsAndVariables.extremedebug, "Not choosing pattern " + pat
                + " because it is contained in or contains the already chosen pattern " + p);
            notchoose = true;
            break;
          } else if (SurfacePattern.subsumes(p, pat)) {
            // subsume is true even if equal context, so check if equal context
            int rest = pat.equalContext(p);
            if (rest == Integer.MAX_VALUE) {
              // the contexts do not match
              Redwood.log(ConstantsAndVariables.extremedebug, "Not choosing pattern " + p
                  + " because it is contained in or contains another chosen pattern in this iteration " + pat);
              removeChosenPatFlag = true;
            } else if (rest < 0) {
              // if pat is less restrictive, remove p from chosen patterns and add pat!
              removeChosenPatFlag = true;
            } else {
              notchoose = true;
              break;
            }
          }
          if (removeChosenPatFlag) {
            if (removeChosenPats == null)
              removeChosenPats = new HashSet<SurfacePattern>();
            removeChosenPats.add(p);
          }
        }
      }
    }

    if (notchoose) {
      Redwood.log(Redwood.DBG, "Not choosing " + pat + " for whatever reason!");
      continue;
    }

    if (removeChosenPats != null) {
      Redwood.log(ConstantsAndVariables.extremedebug, "Removing already chosen patterns in this iteration " + removeChosenPats + " in favor of "
          + pat);
      Counters.removeKeys(chosenPat, removeChosenPats);
      // keep num in step with chosenPat: decrement only for patterns that were really removed
      num -= removeChosenPats.size();
    }
    if (removeIdentifiedPatterns != null) {
      Redwood.log(ConstantsAndVariables.extremedebug, "Removing already identified patterns " + removeIdentifiedPatterns + " in favor of " + pat);
      removePatterns.addAll(removeIdentifiedPatterns);
    }
    chosenPat.setCount(pat, currentPatternWeights4Label.getCount(pat));
    num++;
  }

  this.removeLearnedPatterns(label, removePatterns);

  Redwood.log(Redwood.DBG, "final size of the patterns is " + chosenPat.size());
  Redwood.log(ConstantsAndVariables.minimaldebug, "## Selected Patterns ## \n");
  List<Pair<SurfacePattern, Double>> chosenPatSorted = Counters.toSortedListWithCounts(chosenPat);
  for (Pair<SurfacePattern, Double> en : chosenPatSorted)
    Redwood.log(ConstantsAndVariables.minimaldebug, en.first().toStringToWrite() + ":" + df.format(en.second) + "\n");

  if (constVars.outDir != null && !constVars.outDir.isEmpty()) {
    CollectionValuedMap<SurfacePattern, String> posWords = new CollectionValuedMap<SurfacePattern, String>();
    for (Entry<SurfacePattern, ClassicCounter<String>> en : patternsandWords4Label.entrySet()) {
      posWords.addAll(en.getKey(), en.getValue().keySet());
    }
    CollectionValuedMap<SurfacePattern, String> negWords = new CollectionValuedMap<SurfacePattern, String>();
    for (Entry<SurfacePattern, ClassicCounter<String>> en : negPatternsandWords4Label.entrySet()) {
      negWords.addAll(en.getKey(), en.getValue().keySet());
    }
    CollectionValuedMap<SurfacePattern, String> unlabWords = new CollectionValuedMap<SurfacePattern, String>();
    for (Entry<SurfacePattern, ClassicCounter<String>> en : unLabeledPatternsandWords4Label.entrySet()) {
      unlabWords.addAll(en.getKey(), en.getValue().keySet());
    }

    String outputdir = constVars.outDir + "/" + constVars.identifier + "/" + label;
    Redwood.log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
    IOUtils.ensureDir(new File(outputdir));
    String filename = outputdir + "/patterns" + ".json";

    // The file holds one JSON object per iteration. If an earlier call already wrote it for this
    // label, the existing entries are carried over and this iteration is appended.
    JsonArrayBuilder obj = Json.createArrayBuilder();
    if (writtenPatInJustification.containsKey(label) && writtenPatInJustification.get(label)) {
      JsonReader jsonReader = Json.createReader(new BufferedInputStream(new FileInputStream(filename)));
      JsonArray objarr;
      try {
        objarr = jsonReader.readArray();
      } finally {
        // release the underlying stream even when the file cannot be parsed
        jsonReader.close();
      }
      for (JsonValue o : objarr)
        obj.add(o);
    }

    JsonObjectBuilder objThisIter = Json.createObjectBuilder();
    for (Pair<SurfacePattern, Double> pat : chosenPatSorted) {
      JsonObjectBuilder o = Json.createObjectBuilder();
      JsonArrayBuilder pos = Json.createArrayBuilder();
      JsonArrayBuilder neg = Json.createArrayBuilder();
      JsonArrayBuilder unlab = Json.createArrayBuilder();
      for (String w : posWords.get(pat.first()))
        pos.add(w);
      for (String w : negWords.get(pat.first()))
        neg.add(w);
      for (String w : unlabWords.get(pat.first()))
        unlab.add(w);
      o.add("Positive", pos);
      o.add("Negative", neg);
      o.add("Unlabeled", unlab);
      o.add("Score", pat.second());
      objThisIter.add(pat.first().toStringSimple(), o);
    }
    obj.add(objThisIter.build());

    IOUtils.ensureDir(new File(filename).getParentFile());
    IOUtils.writeStringToFile(obj.build().toString(), filename, "utf8");
    writtenPatInJustification.put(label, true);
  }

  if (constVars.justify) {
    Redwood.log(Redwood.DBG, "Justification for Patterns:");
    for (SurfacePattern key : chosenPat.keySet()) {
      Redwood.log(Redwood.DBG, "\nPattern: " + key.toStringToWrite());
      Redwood.log(
          Redwood.DBG,
          "Positive Words:"
              + Counters.toSortedString(patternsandWords4Label.getCounter(key), patternsandWords4Label.getCounter(key).size(), "%1$s:%2$f", ";"));
      Redwood.log(
          Redwood.DBG,
          "Negative Words:"
              + Counters.toSortedString(negPatternsandWords4Label.getCounter(key), negPatternsandWords4Label.getCounter(key).size(), "%1$s:%2$f",
                  ";"));
      Redwood.log(
          Redwood.DBG,
          "Unlabeled Words:"
              + Counters.toSortedString(unLabeledPatternsandWords4Label.getCounter(key), unLabeledPatternsandWords4Label.getCounter(key).size(),
                  "%1$s:%2$f", ";"));
    }
  }

  allPatternsandWords.put(label, allPatternsandWords4Label);
  patternsandWords.put(label, patternsandWords4Label);
  currentPatternWeights.put(label, currentPatternWeights4Label);
  return chosenPat;
}
/**
 * Maps a pattern scoring measure to the scorer class that implements it.
 *
 * @param patternScoring the scoring measure
 * @return ScorePatternsF1 for F1SeedPattern, ScorePatternsRatioModifiedFreq for the ratio/odds and
 *         phrase-evaluation based measures, ScorePatternsFreqBased for the RlogF family,
 *         YanGarber02 and LinICML03, and {@code null} for any other measure
 */
public static Class getPatternScoringClass(PatternScoring patternScoring) {
  if (patternScoring.equals(PatternScoring.F1SeedPattern)) {
    return ScorePatternsF1.class;
  }

  final boolean usesRatioModifiedFreq = patternScoring.equals(PatternScoring.PosNegUnlabOdds)
      || patternScoring.equals(PatternScoring.PosNegOdds)
      || patternScoring.equals(PatternScoring.RatioAll)
      || patternScoring.equals(PatternScoring.PhEvalInPat)
      || patternScoring.equals(PatternScoring.PhEvalInPatLogP)
      || patternScoring.equals(PatternScoring.LOGREG)
      || patternScoring.equals(PatternScoring.LOGREGlogP)
      || patternScoring.equals(PatternScoring.SqrtAllRatio);
  if (usesRatioModifiedFreq) {
    return ScorePatternsRatioModifiedFreq.class;
  }

  final boolean usesFreqBased = patternScoring.equals(PatternScoring.RlogF)
      || patternScoring.equals(PatternScoring.RlogFPosNeg)
      || patternScoring.equals(PatternScoring.RlogFUnlabNeg)
      || patternScoring.equals(PatternScoring.RlogFNeg)
      || patternScoring.equals(PatternScoring.YanGarber02)
      || patternScoring.equals(PatternScoring.LinICML03);
  if (usesFreqBased) {
    return ScorePatternsFreqBased.class;
  }

  return null;
}
/**
 * Accumulates, for every pattern around every usable token, which phrase the pattern extracted and
 * whether that token is positive, negative or unlabeled for {@code label}.
 *
 * A token is skipped when its word matches constVars.ignoreWordRegex, when its POS tag does not
 * start with one of the allowed tag initials for the label (a token without a tag is skipped too),
 * or when its NER tag is not among the allowed NER tags for the label.
 *
 * A token is positive when its answer-class annotation equals {@code label}. Otherwise it is
 * negative when one of the label's ignore-class annotations is {@code true} on it or when its word
 * or lemma is in constVars.getOtherSemanticClasses(); in all remaining cases it is unlabeled. A
 * missing answer-class or ignore-class annotation is treated as "not set" instead of failing.
 *
 * @param sents sentence id -> tokens
 * @param patternsForEachToken sentence id -> token index -> (previous, next, previous+next) patterns
 * @param label label being learned
 * @param patternsandWords4Label output: counts for positive tokens
 * @param posnegPatternsandWords4Label output: counts for positive and negative tokens
 * @param allPatternsandWords4Label output: counts for all tokens
 * @param negPatternsandWords4Label output: counts for negative tokens
 * @param unLabeledPatternsandWords4Label output: counts for unlabeled tokens
 * @param negandUnLabeledPatternsandWords4Label output: counts for negative and unlabeled tokens
 * @throws RuntimeException if a sentence or a token has no entry in {@code patternsForEachToken}
 */
private void calculateSufficientStats(Map<String, List<CoreLabel>> sents,
    Map<String, Map<Integer, Triple<Set<SurfacePattern>, Set<SurfacePattern>, Set<SurfacePattern>>>> patternsForEachToken, String label,
    TwoDimensionalCounter<SurfacePattern, String> patternsandWords4Label,
    TwoDimensionalCounter<SurfacePattern, String> posnegPatternsandWords4Label,
    TwoDimensionalCounter<SurfacePattern, String> allPatternsandWords4Label,
    TwoDimensionalCounter<SurfacePattern, String> negPatternsandWords4Label,
    TwoDimensionalCounter<SurfacePattern, String> unLabeledPatternsandWords4Label,
    TwoDimensionalCounter<SurfacePattern, String> negandUnLabeledPatternsandWords4Label) {
  Class answerClass4Label = constVars.answerClass.get(label);
  for (Entry<String, List<CoreLabel>> sentEn : sents.entrySet()) {
    Map<Integer, Triple<Set<SurfacePattern>, Set<SurfacePattern>, Set<SurfacePattern>>> pat4Sent = patternsForEachToken.get(sentEn.getKey());
    if (pat4Sent == null) {
      throw new RuntimeException("How come there are no patterns for " + sentEn.getKey() + ". The total patternsForEachToken size is "
          + patternsForEachToken.size() + " and keys " + patternsForEachToken.keySet());
    }
    List<CoreLabel> sent = sentEn.getValue();
    for (int i = 0; i < sent.size(); i++) {
      CoreLabel token = sent.get(i);
      Set<String> matchedPhrases = token.get(PatternsAnnotations.MatchedPhrases.class);
      String tokenWordOrLemma = token.word();

      // Phrase credited to the patterns for a positive token: the token itself unless
      // useMatchingPhrase asks for one of the dictionary phrases that matched it.
      String longestMatchingPhrase = null;
      if (constVars.useMatchingPhrase) {
        if (matchedPhrases != null && !matchedPhrases.isEmpty()) {
          for (String s : matchedPhrases) {
            if (s.equals(tokenWordOrLemma)) {
              longestMatchingPhrase = tokenWordOrLemma;
              break;
            }
            // NOTE(review): this comparison keeps the SHORTEST phrase although the variable is
            // named "longest" - confirm which one is intended before changing it.
            if (longestMatchingPhrase == null || longestMatchingPhrase.length() > s.length()) {
              longestMatchingPhrase = s;
            }
          }
        } else {
          longestMatchingPhrase = tokenWordOrLemma;
        }
      } else
        longestMatchingPhrase = tokenWordOrLemma;

      Triple<Set<SurfacePattern>, Set<SurfacePattern>, Set<SurfacePattern>> pat = pat4Sent.get(i);
      if (pat == null)
        throw new RuntimeException("Why are patterns null for sentence " + sentEn.getKey() + " and token " + i);
      Set<SurfacePattern> prevPat = pat.first();
      Set<SurfacePattern> nextPat = pat.second();
      Set<SurfacePattern> prevnextPat = pat.third();

      if (constVars.ignoreWordRegex.matcher(token.word()).matches())
        continue;

      // if the target word/phrase does not satisfy the POS requirement
      String tag = token.tag();
      if (constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.containsKey(label)) {
        boolean use = false;
        for (String allowed : constVars.allowedTagsInitials.get(label)) {
          // an untagged token cannot satisfy the requirement
          if (tag != null && tag.startsWith(allowed)) {
            use = true;
            break;
          }
        }
        if (!use)
          continue;
      }

      // if the target word/phrase does not satisfy the NER requirements
      String nertag = token.ner();
      if (constVars.allowedNERsforLabels != null && constVars.allowedNERsforLabels.containsKey(label)) {
        if (!constVars.allowedNERsforLabels.get(label).contains(nertag)) {
          continue;
        }
      }

      if (label.equals(token.get(answerClass4Label))) {
        // Positive
        boolean prevTokenLabel = i == 0 ? false : label.equals(sent.get(i - 1).get(answerClass4Label));
        boolean nextTokenLabel = i == sent.size() - 1 ? false : label.equals(sent.get(i + 1).get(answerClass4Label));
        // With ignorePatWithLabeledNeigh, a pattern is not credited when the neighbouring token on
        // the side(s) its context comes from carries the same label.
        if (!constVars.ignorePatWithLabeledNeigh || !prevTokenLabel) {
          for (SurfacePattern s : prevPat) {
            patternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
            posnegPatternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
            allPatternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
          }
        }
        if (!constVars.ignorePatWithLabeledNeigh || !nextTokenLabel) {
          for (SurfacePattern s : nextPat) {
            patternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
            posnegPatternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
            allPatternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
          }
        }
        if (!constVars.ignorePatWithLabeledNeigh || (!prevTokenLabel && !nextTokenLabel)) {
          for (SurfacePattern s : prevnextPat) {
            patternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
            posnegPatternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
            allPatternsandWords4Label.getCounter(s).incrementCount(longestMatchingPhrase);
          }
        }
      } else {
        // Negative or unlabeled
        boolean negToken = false;
        Map<Class, Object> ignore = constVars.ignoreWordswithClassesDuringSelection.get(label);
        if (ignore != null) {
          for (Class igCl : ignore.keySet()) {
            // Boolean.TRUE.equals tolerates tokens on which the annotation was never set
            if (Boolean.TRUE.equals(token.get(igCl))) {
              negToken = true;
              break;
            }
          }
        }
        if (!negToken)
          if (constVars.getOtherSemanticClasses().contains(token.word()) || constVars.getOtherSemanticClasses().contains(token.lemma()))
            negToken = true;

        for (SurfacePattern s : CollectionUtils.union(CollectionUtils.union(prevPat, nextPat), prevnextPat)) {
          if (negToken) {
            negPatternsandWords4Label.getCounter(s).incrementCount(tokenWordOrLemma);
            posnegPatternsandWords4Label.getCounter(s).incrementCount(tokenWordOrLemma);
          } else {
            unLabeledPatternsandWords4Label.getCounter(s).incrementCount(tokenWordOrLemma);
          }
          negandUnLabeledPatternsandWords4Label.getCounter(s).incrementCount(tokenWordOrLemma);
          allPatternsandWords4Label.incrementCount(s, tokenWordOrLemma);
        }
      }
    }
  }
}
/**
 * Collects the patterns that are supported by too few distinct phrases.
 *
 * A pattern is rejected when the number of distinct positive phrases it extracted is below
 * constVars.minPosPhraseSupportForPat, or when the number of distinct unlabeled phrases it
 * extracted is below constVars.minUnlabPhraseSupportForPat. Only patterns present in the respective
 * counter are examined. Neither counter is modified.
 *
 * @param patternsandWords4Label pattern -> positive phrase counts
 * @param unLabeledPatternsandWords4Label pattern -> unlabeled phrase counts
 * @return the patterns to remove
 */
private Set<SurfacePattern> enforceMinSupportRequirements(TwoDimensionalCounter<SurfacePattern, String> patternsandWords4Label,
    TwoDimensionalCounter<SurfacePattern, String> unLabeledPatternsandWords4Label) {
  Set<SurfacePattern> rejected = new HashSet<SurfacePattern>();

  for (Entry<SurfacePattern, ClassicCounter<String>> posEntry : patternsandWords4Label.entrySet()) {
    boolean tooFewPositivePhrases = posEntry.getValue().size() < constVars.minPosPhraseSupportForPat;
    if (tooFewPositivePhrases) {
      rejected.add(posEntry.getKey());
    }
  }
  int rejectedForPosSupport = rejected.size();
  Redwood.log(Redwood.DBG, "Removing " + rejectedForPosSupport + " patterns that do not meet minPosPhraseSupportForPat requirement of >= "
      + constVars.minPosPhraseSupportForPat);

  for (Entry<SurfacePattern, ClassicCounter<String>> unlabEntry : unLabeledPatternsandWords4Label.entrySet()) {
    boolean tooFewUnlabeledPhrases = unlabEntry.getValue().size() < constVars.minUnlabPhraseSupportForPat;
    if (tooFewUnlabeledPhrases) {
      rejected.add(unlabEntry.getKey());
    }
  }
  // only patterns not already rejected for positive support are counted here
  int rejectedForUnlabSupport = rejected.size() - rejectedForPosSupport;
  Redwood.log(Redwood.DBG, "Removing " + rejectedForUnlabSupport + " patterns that do not meet minUnlabPhraseSupportForPat requirement of >= "
      + constVars.minUnlabPhraseSupportForPat);

  return rejected;
}
/**
 * Forgets a single learned pattern for {@code label}: removes it from the learned patterns and from
 * the per-word record of the patterns that extracted each word.
 *
 * Does nothing for the learned patterns when no pattern has been recorded for the label yet.
 *
 * @param label label whose bookkeeping is updated
 * @param p pattern to remove
 */
void removeLearnedPattern(String label, SurfacePattern p) {
  Counter<SurfacePattern> learned4Label = this.learnedPatterns.get(label);
  // a label without learned patterns has no counter yet
  if (learned4Label != null)
    learned4Label.remove(p);
  if (wordsPatExtracted.containsKey(label))
    for (Entry<String, ClassicCounter<SurfacePattern>> en : this.wordsPatExtracted.get(label).entrySet()) {
      en.getValue().remove(p);
    }
}
/**
 * Forgets several learned patterns for {@code label}: removes them from the learned patterns and
 * from the per-word record of the patterns that extracted each word.
 *
 * Does nothing for the learned patterns when no pattern has been recorded for the label yet.
 *
 * @param label label whose bookkeeping is updated
 * @param pats patterns to remove
 */
void removeLearnedPatterns(String label, Collection<SurfacePattern> pats) {
  Counter<SurfacePattern> learned4Label = this.learnedPatterns.get(label);
  // a label without learned patterns has no counter yet
  if (learned4Label != null)
    Counters.removeKeys(learned4Label, pats);
  if (wordsPatExtracted.containsKey(label))
    for (Entry<String, ClassicCounter<SurfacePattern>> en : this.wordsPatExtracted.get(label).entrySet()) {
      Counters.removeKeys(en.getValue(), pats);
    }
}
/**
 * Returns a copy of {@code scores}, optionally squashed through a logistic function and/or
 * rescaled by min-max normalisation. The input counter is not modified.
 *
 * With {@code softmax}, each score x (capped at 7) becomes 1 / (1 + exp(-x)), or
 * 1 / (1 + exp(x)) when {@code oneMinusSoftMax} is set. With {@code minMaxNorm}, each resulting
 * score s becomes (s - min + 1e-10) / (max - min); when all scores are equal they are left as they
 * are.
 *
 * @param scores scores to normalise
 * @param minMaxNorm whether to apply min-max normalisation
 * @param softmax whether to apply the logistic squashing first
 * @param oneMinusSoftMax whether to use the complementary logistic value; only read when
 *          {@code softmax} is set
 * @return a new counter holding the normalised scores
 */
public static Counter<String> normalizeSoftMaxMinMaxScores(Counter<String> scores, boolean minMaxNorm, boolean softmax, boolean oneMinusSoftMax) {
  // Start from the infinities: Double.MIN_VALUE is the smallest POSITIVE double, so using it as the
  // initial maximum breaks the normalisation when every squashed score underflows to 0.
  double minScore = Double.POSITIVE_INFINITY, maxScore = Double.NEGATIVE_INFINITY;
  Counter<String> newscores = new ClassicCounter<String>();
  if (softmax) {
    for (Entry<String, Double> en : scores.entrySet()) {
      // the cap at 7 bounds the exponent
      double capped = Math.min(7, en.getValue());
      double score;
      if (oneMinusSoftMax)
        score = (1 / (1 + Math.exp(capped)));
      else
        score = (1 / (1 + Math.exp(-1 * capped)));
      if (score < minScore)
        minScore = score;
      if (score > maxScore)
        maxScore = score;
      newscores.setCount(en.getKey(), score);
    }
  } else {
    newscores.addAll(scores);
    minScore = Counters.min(newscores);
    maxScore = Counters.max(newscores);
  }
  if (minMaxNorm) {
    for (Entry<String, Double> en : newscores.entrySet()) {
      double score;
      if (minScore == maxScore)
        score = minScore;
      else
        score = (en.getValue() - minScore + 1e-10) / (maxScore - minScore);
      newscores.setCount(en.getKey(), score);
    }
  }
  return newscores;
}
// Phrase -> score-measure values; getPatterns hands this counter to ScorePatternsRatioModifiedFreq.
// NOTE(review): presumably a cache reused across iterations - confirm against ScorePatternsRatioModifiedFreq.
public TwoDimensionalCounter<String, ScorePhraseMeasures> phInPatScoresCache = new TwoDimensionalCounter<String, ScorePhraseMeasures>();
// TODO: this currently may not work for matchPatterns because of
// DictAnnotationDTorSC: the DT/SC annotations are not set on the test sentences.
// Update: this comment may no longer be relevant.
/**
 * Labels every occurrence of the identified phrases in the given sentences.
 *
 * For each exact occurrence of a phrase (found with ArrayUtils.getSubListIndex over the token
 * words) and, when constVars.usePatternResultAsLabel is set, each token of the occurrence gets
 * {@code label} under the label's answer class and its MatchedPhrases annotation is replaced by
 * the phrase. The patterns of the surrounding tokens are then recomputed in
 * {@code patternsForEachToken} for sentences that have an entry there. With
 * constVars.restrictToMatched, an occurrence is used only if every one of its tokens was matched by
 * some pattern according to {@code matchedTokensByPat}.
 *
 * The phrases are applied in the iteration order of {@code identifiedWords}; when several phrases
 * cover the same token, the last one applied determines its MatchedPhrases annotation.
 *
 * @param label label to assign
 * @param sents sentence id -> tokens; the tokens are modified in place
 * @param identifiedWords phrases (whitespace-separated tokens) to label
 * @param patterns not used by this method
 * @param outFile if not null, {@code sents} is serialized to this file afterwards
 * @param matchedTokensByPat pattern -> (sentence id, start token, end token inclusive) matches; only
 *          read when constVars.restrictToMatched is set
 * @throws IOException if writing {@code outFile} fails
 */
public void labelWords(String label, Map<String, List<CoreLabel>> sents, Set<String> identifiedWords, Set<SurfacePattern> patterns, String outFile,
    CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>> matchedTokensByPat) throws IOException {
  // sentence id -> indices of the tokens matched by at least one pattern
  CollectionValuedMap<String, Integer> tokensMatchedPatterns = null;
  if (constVars.restrictToMatched) {
    tokensMatchedPatterns = new CollectionValuedMap<String, Integer>();
    for (Entry<SurfacePattern, Collection<Triple<String, Integer, Integer>>> en : matchedTokensByPat.entrySet()) {
      for (Triple<String, Integer, Integer> en2 : en.getValue()) {
        for (int i = en2.second(); i <= en2.third(); i++) {
          tokensMatchedPatterns.add(en2.first(), i);
        }
      }
    }
  }

  // Tokenise the phrases once; they do not depend on the sentence. A list keeps the order of
  // identifiedWords (a set of arrays would iterate in identity-hash order).
  List<String[]> identifiedWordsTokens = new ArrayList<String[]>();
  for (String s : identifiedWords) {
    identifiedWordsTokens.add(s.split("\\s+"));
  }

  for (Entry<String, List<CoreLabel>> sentEn : sents.entrySet()) {
    String[] sent = new String[sentEn.getValue().size()];
    int i = 0;
    // indices of the tokens whose patterns must be recomputed because a nearby label changed
    Set<Integer> contextWordsRecalculatePats = new HashSet<Integer>();
    for (CoreLabel l : sentEn.getValue()) {
      sent[i] = l.word();
      i++;
    }
    for (String[] ph : identifiedWordsTokens) {
      List<Integer> ints = ArrayUtils.getSubListIndex(ph, sent);
      if (ints == null)
        continue;
      for (Integer idx : ints) {
        boolean donotuse = false;
        if (constVars.restrictToMatched) {
          for (int j = 0; j < ph.length; j++) {
            if (!tokensMatchedPatterns.get(sentEn.getKey()).contains(idx + j)) {
              Redwood.log(ConstantsAndVariables.extremedebug, "not labeling " + sentEn.getValue().get(idx + j).word());
              donotuse = true;
              break;
            }
          }
        }
        if (!donotuse && constVars.usePatternResultAsLabel) {
          String phrase = StringUtils.join(ph, " ");
          for (int j = 0; j < ph.length; j++) {
            int index = idx + j;
            CoreLabel l = sentEn.getValue().get(index);
            l.set(constVars.answerClass.get(label), label);
            Set<String> matched = new HashSet<String>();
            matched.add(phrase);
            l.set(PatternsAnnotations.MatchedPhrases.class, matched);
            for (int k = Math.max(0, index - constVars.numWordsCompound); k < sentEn.getValue().size()
                && k <= index + constVars.numWordsCompound + 1; k++) {
              contextWordsRecalculatePats.add(k);
            }
          }
        }
      }
    }
    if (patternsForEachToken != null && patternsForEachToken.containsKey(sentEn.getKey())) {
      for (int index : contextWordsRecalculatePats)
        this.patternsForEachToken.get(sentEn.getKey()).put(index, createPats.getContext(sentEn.getValue(), index));
    }
  }

  if (outFile != null) {
    Redwood.log(ConstantsAndVariables.minimaldebug, "Writing results to " + outFile);
    IOUtils.writeObjectToFile(sents, outFile);
  }
}
/**
 * Runs the pattern/word learning loop for all labels.
 *
 * For up to constVars.numIterationsForPatterns iterations, iterateExtractApply4Label is called once
 * per label. With constVars.useOtherLabelsWordsasNegative, the dictionary words of, and the words
 * learned for, the other labels are passed to a label as words to ignore. When an iteration learns
 * no pattern for any label, the loop stops, unless constVars.tuneThresholdKeepRunning is set, in
 * which case constVars.thresholdSelectPattern is multiplied by 0.8 and the loop continues.
 * Afterwards the matched-token justification files are written (when constVars.outDir is set and
 * constVars.writeMatchedTokensFiles is enabled) and all learned patterns and words are printed to
 * standard output.
 *
 * The per-label output writers are closed when the method finishes, whether it completes normally
 * or fails with an exception.
 *
 * @param p0 label -> seed pattern; may be null
 * @param p0Set label -> phrases of the seed pattern; may be null
 * @param wordsOutputFile prefix of the per-label learned-words file ({@code <prefix>_<label>}); when
 *          null, {@code <outDir>/<identifier>/<label>/learnedwords.txt} is used
 * @param sentsOutFile prefix of the per-label labeled-sentences file; when null none is requested
 * @param patternsOutFile prefix of the per-label learned-patterns file ({@code <prefix>_<label>});
 *          when null, {@code <outDir>/<identifier>/<label>/learnedpatterns.txt} is used
 * @param ignorePatterns label -> patterns that must not be learned; may be null
 */
public void iterateExtractApply(Map<String, SurfacePattern> p0, Map<String, Counter<String>> p0Set, String wordsOutputFile, String sentsOutFile,
    String patternsOutFile, Map<String, Set<SurfacePattern>> ignorePatterns) throws ClassNotFoundException, IOException, InterruptedException,
    ExecutionException, InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException,
    SecurityException {
  Map<String, CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>>> matchedTokensByPatAllLabels = new HashMap<String, CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>>>();
  Map<String, TwoDimensionalCounter<String, SurfacePattern>> termsAllLabels = new HashMap<String, TwoDimensionalCounter<String, SurfacePattern>>();
  Map<String, Set<String>> ignoreWordsAll = new HashMap<String, Set<String>>();
  for (String label : constVars.getLabelDictionary().keySet()) {
    matchedTokensByPatAllLabels.put(label, new CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>>());
    termsAllLabels.put(label, new TwoDimensionalCounter<String, SurfacePattern>());
    if (constVars.useOtherLabelsWordsasNegative) {
      // the seed words of every other label are words to ignore for this label
      Set<String> w = new HashSet<String>();
      for (Entry<String, Set<String>> en : constVars.getLabelDictionary().entrySet()) {
        if (en.getKey().equals(label))
          continue;
        w.addAll(en.getValue());
      }
      ignoreWordsAll.put(label, w);
    }
  }

  Redwood.log(ConstantsAndVariables.minimaldebug, "Iterating " + constVars.numIterationsForPatterns + " times.");

  Map<String, BufferedWriter> wordsOutput = new HashMap<String, BufferedWriter>();
  Map<String, BufferedWriter> patternsOutput = new HashMap<String, BufferedWriter>();
  boolean finished = false;
  try {
    for (String label : constVars.getLabelDictionary().keySet()) {
      IOUtils.ensureDir(new File(constVars.outDir + "/" + constVars.identifier + "/" + label));

      String wordsOutputFileLabel = wordsOutputFile == null ? constVars.outDir + "/" + constVars.identifier + "/" + label + "/learnedwords.txt"
          : wordsOutputFile + "_" + label;
      wordsOutput.put(label, new BufferedWriter(new FileWriter(wordsOutputFileLabel)));
      Redwood.log(ConstantsAndVariables.minimaldebug, "Saving the learned words for label " + label + " in " + wordsOutputFileLabel);

      String patternsOutputFileLabel = patternsOutFile == null ? constVars.outDir + "/" + constVars.identifier + "/" + label
          + "/learnedpatterns.txt" : patternsOutFile + "_" + label;
      patternsOutput.put(label, new BufferedWriter(new FileWriter(patternsOutputFileLabel)));
      Redwood.log(ConstantsAndVariables.minimaldebug, "Saving the learned patterns for label " + label + " in " + patternsOutputFileLabel);
    }

    for (int i = 0; i < constVars.numIterationsForPatterns; i++) {
      Redwood
          .log(ConstantsAndVariables.minimaldebug, "\n\n################################ Iteration " + (i + 1) + " ##############################");
      // set as soon as any label learns at least one pattern in this iteration
      boolean keepRunning = false;
      Map<String, Counter<String>> learnedWordsThisIter = new HashMap<String, Counter<String>>();
      for (String label : constVars.getLabelDictionary().keySet()) {
        Redwood.log(ConstantsAndVariables.minimaldebug, "\n###Learning for label " + label + " ######");
        String sentout = sentsOutFile == null ? null : sentsOutFile + "_" + label;
        Pair<Counter<SurfacePattern>, Counter<String>> learnedPatWords4label = iterateExtractApply4Label(label, p0 != null ? p0.get(label) : null,
            p0Set != null ? p0Set.get(label) : null, wordsOutput.get(label), sentout, patternsOutput.get(label),
            ignorePatterns != null ? ignorePatterns.get(label) : null, 1, ignoreWordsAll.get(label), matchedTokensByPatAllLabels.get(label),
            termsAllLabels.get(label));
        learnedWordsThisIter.put(label, learnedPatWords4label.second());
        if (learnedPatWords4label.first().size() > 0) {
          keepRunning = true;
        }
      }

      if (constVars.useOtherLabelsWordsasNegative) {
        // words just learned for the other labels become words to ignore for this label
        for (String label : constVars.getLabelDictionary().keySet()) {
          for (Entry<String, Counter<String>> en : learnedWordsThisIter.entrySet()) {
            if (en.getKey().equals(label))
              continue;
            ignoreWordsAll.get(label).addAll(en.getValue().keySet());
          }
        }
      }

      if (!keepRunning) {
        if (!constVars.tuneThresholdKeepRunning) {
          Redwood.log(ConstantsAndVariables.minimaldebug, "No patterns learned for all labels. Ending iterations.");
          break;
        } else {
          constVars.thresholdSelectPattern = 0.8 * constVars.thresholdSelectPattern;
          Redwood.log(ConstantsAndVariables.minimaldebug, "\n\nTuning thresholds to keep running. New Pattern threshold is "
              + constVars.thresholdSelectPattern);
        }
      }
    }

    if (constVars.outDir != null && !constVars.outDir.isEmpty()) {
      Redwood.log(ConstantsAndVariables.minimaldebug, "Writing justification files");
      // accumulated across labels; the sentences file is rewritten for every label
      Set<String> allMatchedSents = new HashSet<String>();
      for (String label : constVars.getLabelDictionary().keySet()) {
        CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>> tokensMatchedPat = matchedTokensByPatAllLabels.get(label);
        IOUtils.ensureDir(new File(constVars.outDir + "/" + constVars.identifier + "/" + label));
        if (constVars.writeMatchedTokensFiles) {
          String matchedtokensfilename = constVars.outDir + "/" + constVars.identifier + "/" + label + "/tokensmatchedpatterns" + ".json";
          // pattern -> sentence id -> list of [start, end] token spans
          JsonObjectBuilder pats = Json.createObjectBuilder();
          for (Entry<SurfacePattern, Collection<Triple<String, Integer, Integer>>> en : tokensMatchedPat.entrySet()) {
            CollectionValuedMap<String, Pair<Integer, Integer>> matchedStrs = new CollectionValuedMap<String, Pair<Integer, Integer>>();
            for (Triple<String, Integer, Integer> en2 : en.getValue()) {
              allMatchedSents.add(en2.first());
              matchedStrs.add(en2.first(), new Pair<Integer, Integer>(en2.second(), en2.third()));
            }
            JsonObjectBuilder senttokens = Json.createObjectBuilder();
            for (Entry<String, Collection<Pair<Integer, Integer>>> sen : matchedStrs.entrySet()) {
              JsonArrayBuilder obj = Json.createArrayBuilder();
              for (Pair<Integer, Integer> sen2 : sen.getValue()) {
                JsonArrayBuilder startend = Json.createArrayBuilder();
                startend.add(sen2.first());
                startend.add(sen2.second());
                obj.add(startend);
              }
              senttokens.add(sen.getKey(), obj);
            }
            pats.add(en.getKey().toStringSimple(), senttokens);
          }
          IOUtils.writeStringToFile(pats.build().toString(), matchedtokensfilename, "utf8");

          // Writing the sentence json file -- tokens for each sentence
          JsonObjectBuilder senttokens = Json.createObjectBuilder();
          for (String sentId : allMatchedSents) {
            JsonArrayBuilder sent = Json.createArrayBuilder();
            for (CoreLabel l : Data.sents.get(sentId)) {
              sent.add(l.word());
            }
            senttokens.add(sentId, sent);
          }
          String sentfilename = constVars.outDir + "/" + constVars.identifier + "/sentences" + ".json";
          IOUtils.writeStringToFile(senttokens.build().toString(), sentfilename, "utf8");
        }
      }
    }

    System.out.println("\n\nAll patterns learned:");
    for (Entry<String, Counter<SurfacePattern>> en : this.learnedPatterns.entrySet()) {
      System.out.println(en.getKey() + ":\t\t" + StringUtils.join(en.getValue().keySet(), "\n") + "\n\n");
    }
    System.out.println("\n\nAll words learned:");
    for (Entry<String, Counter<String>> en : this.learnedWords.entrySet()) {
      System.out.println(en.getKey() + ":\t\t" + en.getValue().keySet() + "\n\n");
    }
    finished = true;
  } finally {
    // close all the writers, including those opened before a failure
    List<BufferedWriter> writers = new ArrayList<BufferedWriter>(wordsOutput.values());
    writers.addAll(patternsOutput.values());
    IOException closeFailure = null;
    for (BufferedWriter writer : writers) {
      try {
        writer.close();
      } catch (IOException e) {
        if (closeFailure == null)
          closeFailure = e;
      }
    }
    // report a close failure only on the success path so it cannot mask the original exception
    if (finished && closeFailure != null)
      throw closeFailure;
  }
}
/**
 * Runs up to {@code numIter} rounds of pattern learning followed by phrase learning for one label.
 * <p>
 * Each round adds the patterns returned by {@code getPatterns} to the running pattern counter and to
 * {@code learnedPatterns}, asks {@code scorePhrases.learnNewPhrases} for phrases, and, when
 * {@code constVars.usePatternResultAsLabel} is set, re-labels the sentences (in memory, or batch file
 * by batch file when {@code constVars.batchProcessSents} is set) and records the phrases in
 * {@code learnedWords}. The loop stops early when neither patterns nor words have been found and
 * threshold tuning is disabled, or when the number of learned words has reached
 * {@code constVars.maxExtractNumWords}.
 *
 * @param label label being learned
 * @param p0 forwarded to {@code getPatterns}
 * @param p0Set forwarded to {@code getPatterns}
 * @param wordsOutput writer receiving the identified words after each productive round; may be null
 * @param sentsOutFile base name for the labeled-sentence output; may be null. The name handed to
 *          {@code labelWords} in round {@code i} is {@code sentsOutFile + "_" + i + "iter.ser"}
 * @param patternsOut writer receiving all learned patterns for the label at the end; may be null
 * @param ignorePatterns forwarded to {@code getPatterns}
 * @param numIter maximum number of rounds
 * @param ignoreWords forwarded to {@code learnNewPhrases}
 * @param matchedTokensByPat forwarded to {@code learnNewPhrases} and {@code labelWords}
 * @param terms forwarded to {@code learnNewPhrases}
 * @return the patterns and the words accumulated during this call
 * @throws RuntimeException if labeling is requested but the label is not in the label dictionary
 */
public Pair<Counter<SurfacePattern>, Counter<String>> iterateExtractApply4Label(String label, SurfacePattern p0, Counter<String> p0Set,
    BufferedWriter wordsOutput, String sentsOutFile, BufferedWriter patternsOut, Set<SurfacePattern> ignorePatterns, int numIter,
    Set<String> ignoreWords, CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>> matchedTokensByPat,
    TwoDimensionalCounter<String, SurfacePattern> terms) throws IOException, InterruptedException, ExecutionException, ClassNotFoundException,
    InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException {
  if (!learnedPatterns.containsKey(label)) {
    learnedPatterns.put(label, new ClassicCounter<SurfacePattern>());
  }
  if (!learnedWords.containsKey(label)) {
    learnedWords.put(label, new ClassicCounter<String>());
  }
  Counter<String> identifiedWords = new ClassicCounter<String>();
  Counter<SurfacePattern> patterns = new ClassicCounter<SurfacePattern>();
  for (int i = 0; i < numIter; i++) {
    patterns.addAll(getPatterns(label, learnedPatterns.get(label).keySet(), p0, p0Set, ignorePatterns));
    learnedPatterns.get(label).addAll(patterns);
    // Build the per-iteration file name from the caller's base name. Reassigning sentsOutFile
    // itself would make the suffixes pile up ("x_0iter.ser_1iter.ser", ...) on later rounds.
    String iterSentsOutFile = null;
    if (sentsOutFile != null) {
      iterSentsOutFile = sentsOutFile + "_" + i + "iter.ser";
    }
    Counter<String> scoreForAllWordsThisIteration = new ClassicCounter<String>();
    identifiedWords.addAll(scorePhrases.learnNewPhrases(label, this.patternsForEachToken, patterns, learnedPatterns.get(label), matchedTokensByPat,
        scoreForAllWordsThisIteration, terms, wordsPatExtracted.get(label), currentPatternWeights.get(label), this.patternsandWords.get(label),
        this.allPatternsandWords.get(label), constVars.identifier, ignoreWords));
    if (identifiedWords.size() > 0) {
      if (constVars.usePatternResultAsLabel) {
        if (constVars.getLabelDictionary().containsKey(label)) {
          if (constVars.batchProcessSents) {
            // Batch mode: load each serialized sentence file, label it, and write it back.
            for (File f : Data.sentsFiles) {
              Redwood.log(Redwood.DBG, "labeling sentences from " + f);
              Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
              labelWords(label, sents, identifiedWords.keySet(), patterns.keySet(), iterSentsOutFile, matchedTokensByPat);
              IOUtils.writeObjectToFile(sents, f);
            }
          } else {
            labelWords(label, Data.sents, identifiedWords.keySet(), patterns.keySet(), iterSentsOutFile, matchedTokensByPat);
          }
        } else {
          throw new RuntimeException("why is the answer label null?");
        }
        learnedWords.get(label).addAll(identifiedWords);
      }
      if (wordsOutput != null) {
        wordsOutput.write("\n" + Counters.toSortedString(identifiedWords, identifiedWords.size(), "%1$s", "\n"));
        wordsOutput.flush();
      }
    }
    if (patterns.size() == 0 && identifiedWords.size() == 0) {
      if (learnedWords.get(label).size() >= constVars.maxExtractNumWords) {
        System.out.println("Ending because no new words identified and total words learned till now >= max words " + constVars.maxExtractNumWords);
        break;
      }
      if (constVars.tuneThresholdKeepRunning) {
        // Relax the pattern-selection threshold by 20% and try another round.
        constVars.thresholdSelectPattern = 0.8 * constVars.thresholdSelectPattern;
        System.out.println("\n\nTuning thresholds to keep running. New Pattern threshold is " + constVars.thresholdSelectPattern);
      } else {
        break;
      }
    }
  }
  if (patternsOut != null) {
    this.writePatternsToFile(learnedPatterns.get(label), patternsOut);
  }
  return new Pair<Counter<SurfacePattern>, Counter<String>>(patterns, identifiedWords);
}
/**
 * Writes one line per pattern to {@code outFile} in the form {@code pattern<TAB>score}.
 * The writer is neither flushed nor closed here.
 *
 * @param pattern patterns and their scores
 * @param outFile destination writer
 * @throws IOException if writing fails
 */
void writePatternsToFile(Counter<SurfacePattern> pattern, BufferedWriter outFile) throws IOException {
  for (Entry<SurfacePattern, Double> scored : pattern.entrySet()) {
    StringBuilder row = new StringBuilder();
    row.append(scored.getKey().toString()).append('\t').append(scored.getValue()).append('\n');
    outFile.write(row.toString());
  }
}
/**
 * Writes one line per word to {@code outFile} in the form {@code word<TAB>score}.
 * The writer is neither flushed nor closed here.
 *
 * @param words words and their scores
 * @param outFile destination writer
 * @throws IOException if writing fails
 */
void writeWordsToFile(Counter<String> words, BufferedWriter outFile) throws IOException {
  for (Entry<String, Double> scored : words.entrySet()) {
    StringBuilder row = new StringBuilder();
    row.append(scored.getKey()).append('\t').append(scored.getValue()).append('\n');
    outFile.write(row.toString());
  }
}
/**
 * Reads a tab-separated file of {@code word<TAB>score} lines into a counter.
 * Every line must have at least two tab-separated fields, the second parseable as a double.
 *
 * @param file file to read
 * @return counter mapping each word to its parsed score
 */
Counter<String> readLearnedWordsFromFile(File file) {
  Counter<String> loaded = new ClassicCounter<String>();
  for (String row : IOUtils.readLines(file)) {
    String[] fields = row.split("\t");
    String phrase = fields[0];
    double score = Double.parseDouble(fields[1]);
    loaded.setCount(phrase, score);
  }
  return loaded;
}
/**
 * Looks up the words stored for a label.
 *
 * @param label label to look up
 * @return the counter stored in {@code learnedWords} for {@code label}
 */
public Counter<String> getLearnedWords(String label) {
  Counter<String> wordsForLabel = learnedWords.get(label);
  return wordsForLabel;
}
/**
 * Looks up the patterns stored for a label.
 *
 * @param label label to look up
 * @return the counter stored in {@code learnedPatterns} for {@code label}
 */
public Counter<SurfacePattern> getLearnedPatterns(String label) {
  Counter<SurfacePattern> patternsForLabel = learnedPatterns.get(label);
  return patternsForLabel;
}
/**
 * Stores {@code words} as the learned words for {@code label}, replacing any previous entry.
 *
 * @param words counter to store
 * @param label label the words belong to
 */
public void setLearnedWords(Counter<String> words, String label) {
  learnedWords.put(label, words);
}
/**
 * Stores {@code patterns} as the learned patterns for {@code label}, replacing any previous entry.
 *
 * @param patterns counter to store
 * @param label label the patterns belong to
 */
public void setLearnedPatterns(Counter<SurfacePattern> patterns, String label) {
  learnedPatterns.put(label, patterns);
}
/**
 * COPIED from CRFClassifier: Count the successes and failures of the model on
 * the given document, entity by entity. Fills numbers in to counters for true
 * positives, false positives, and false negatives, and also keeps track of the
 * entity strings seen. <br>
 * Returns false if we ever encounter null for gold or guess. NOTE: The
 * current implementation of counting wordFN/FP is incorrect.
 *
 * @param doc tokens carrying a gold label in {@code GoldAnswerAnnotation} and a guessed label
 *          under {@code whichClassToCompare}
 * @param entityTP incremented per label for each true-positive entity
 * @param entityFP incremented per label for each false-positive entity
 * @param entityFN incremented per label for each false-negative entity
 * @param background the label that marks "no entity"
 * @param wordTP incremented with the accumulated token string of each true positive
 * @param wordTN not used by this method
 * @param wordFP incremented with the accumulated token string of each false positive
 * @param wordFN incremented with the accumulated token string of each false negative
 * @param whichClassToCompare annotation key holding the guessed label
 * @return false as soon as a token has a null gold or guessed label (counters may already have
 *         been partially updated); true otherwise
 */
public static boolean countResultsPerEntity(List<CoreLabel> doc, Counter<String> entityTP, Counter<String> entityFP, Counter<String> entityFN,
    String background, Counter<String> wordTP, Counter<String> wordTN, Counter<String> wordFP, Counter<String> wordFN,
    Class<? extends TypesafeMap.Key<String>> whichClassToCompare) {
  int index = 0;
  int goldIndex = 0, guessIndex = 0;
  String lastGold = background, lastGuess = background;
  // As we go through the document, there are two events we might be
  // interested in. One is when a gold entity ends, and the other
  // is when a guessed entity ends. If the gold and guessed
  // entities end at the same time, started at the same time, and
  // match entity type, we have a true positive. Otherwise we
  // either have a false positive or a false negative.
  // str accumulates the words of the span currently being tracked.
  String str = "";
  for (CoreLabel line : doc) {
    String gold = line.get(CoreAnnotations.GoldAnswerAnnotation.class);
    String guess = line.get(whichClassToCompare);
    if (gold == null || guess == null)
      return false;
    // A gold entity just ended.
    if (lastGold != null && !lastGold.equals(gold) && !lastGold.equals(background)) {
      if (lastGuess.equals(lastGold) && !lastGuess.equals(guess) && goldIndex == guessIndex) {
        wordTP.incrementCount(str);
        entityTP.incrementCount(lastGold, 1.0);
      } else {
        wordFN.incrementCount(str);
        entityFN.incrementCount(lastGold, 1.0);
        str = "";
      }
    }
    // A guessed entity just ended.
    if (lastGuess != null && !lastGuess.equals(guess) && !lastGuess.equals(background)) {
      if (lastGuess.equals(lastGold) && !lastGuess.equals(guess) && goldIndex == guessIndex && !lastGold.equals(gold)) {
        // correct guesses already tallied
        // only need to tally false positives
      } else {
        entityFP.incrementCount(lastGuess, 1.0);
        wordFP.incrementCount(str);
      }
      str = "";
    }
    // Both sides were background: nothing to remember.
    if (lastGuess != null && lastGold != null && lastGold.equals(background) && lastGuess.equals(background)) {
      str = "";
    }
    // Remember where the current gold / guessed span started.
    if (lastGold == null || !lastGold.equals(gold)) {
      lastGold = gold;
      goldIndex = index;
    }
    if (lastGuess == null || !lastGuess.equals(guess)) {
      lastGuess = guess;
      guessIndex = index;
    }
    ++index;
    if (str.isEmpty())
      str = line.word();
    else
      str += " " + line.word();
  }
  // We also have to account for entities at the very end of the
  // document, since the above logic only occurs when we see
  // something that tells us an entity has ended
  if (lastGold != null && !lastGold.equals(background)) {
    if (lastGold.equals(lastGuess) && goldIndex == guessIndex) {
      entityTP.incrementCount(lastGold, 1.0);
      wordTP.incrementCount(str);
    } else {
      entityFN.incrementCount(lastGold, 1.0);
      wordFN.incrementCount(str);
    }
    str = "";
  }
  if (lastGuess != null && !lastGuess.equals(background)) {
    if (lastGold.equals(lastGuess) && goldIndex == guessIndex) {
      // correct guesses already tallied
    } else {
      entityFP.incrementCount(lastGuess, 1.0);
      wordFP.incrementCount(str);
    }
    str = "";
  }
  return true;
}
/**
 * Count the successes and failures of the model on the given document
 * ***token-based***. Fills numbers in to counters for true positives, false
 * positives, false negatives and true negatives, and also keeps track of the
 * words seen. <br>
 * Throws if we ever encounter null for gold or guess.
 *
 * this currently is only for testing one label at a time
 *
 * NOTE(review): the entity counters are also passed to {@code CRFClassifier.countResults} before
 * the per-token loop, so they may receive contributions from both; confirm this is intended.
 *
 * @param doc tokens carrying a gold label in {@code GoldAnswerAnnotation} and a guessed label
 *          under {@code whichClassToCompare}
 * @param entityTP incremented with the gold label for each true-positive token
 * @param entityFP incremented with the guessed label for each false-positive token
 * @param entityFN incremented with the gold label for each false-negative token
 * @param background the label that marks "no entity" (compared case-insensitively)
 * @param wordTP incremented with the word of each true-positive token
 * @param wordTN incremented with the word of each token where both gold and guess are background
 * @param wordFP incremented with the word of each false-positive token
 * @param wordFN incremented with the word of each false-negative token
 * @param whichClassToCompare annotation key holding the guessed label
 * @throws RuntimeException if a gold or guessed label is null, or if gold and guess are two
 *           different non-background labels
 */
public static void countResultsPerToken(List<CoreLabel> doc, Counter<String> entityTP, Counter<String> entityFP, Counter<String> entityFN,
    String background, Counter<String> wordTP, Counter<String> wordTN, Counter<String> wordFP, Counter<String> wordFN,
    Class<? extends TypesafeMap.Key<String>> whichClassToCompare) {
  CRFClassifier.countResults(doc, entityTP, entityFP, entityFN, background);
  for (CoreLabel line : doc) {
    String gold = line.get(GoldAnswerAnnotation.class);
    String guess = line.get(whichClassToCompare);
    if (gold == null || guess == null)
      throw new RuntimeException("why is gold or guess null?");
    boolean sameLabel = gold.equals(guess);
    boolean goldIsBackground = gold.equalsIgnoreCase(background);
    boolean guessIsBackground = guess.equalsIgnoreCase(background);
    if (sameLabel && !goldIsBackground) {
      entityTP.incrementCount(gold);
      wordTP.incrementCount(line.word());
    } else if (!sameLabel && !goldIsBackground && guessIsBackground) {
      entityFN.incrementCount(gold);
      wordFN.incrementCount(line.word());
    } else if (!sameLabel && !guessIsBackground && goldIsBackground) {
      wordFP.incrementCount(line.word());
      entityFP.incrementCount(guess);
    } else if (goldIsBackground && guessIsBackground) {
      // True negative. The original condition here duplicated the true-positive test, which
      // made this branch unreachable and left wordTN permanently empty.
      wordTN.incrementCount(line.word());
    } else {
      throw new RuntimeException("don't know reached here. not meant for more than one entity label");
    }
  }
}
/**
 * Dispatches to entity-level or token-level counting.
 *
 * @param evalPerEntity if true, delegates to {@code countResultsPerEntity} (its boolean result is
 *          ignored); otherwise delegates to {@code countResultsPerToken}
 */
public static void countResults(List<CoreLabel> doc, Counter<String> entityTP, Counter<String> entityFP, Counter<String> entityFN,
    String background, Counter<String> wordTP, Counter<String> wordTN, Counter<String> wordFP, Counter<String> wordFN,
    Class<? extends TypesafeMap.Key<String>> whichClassToCompare, boolean evalPerEntity) {
  if (!evalPerEntity) {
    countResultsPerToken(doc, entityTP, entityFP, entityFN, background, wordTP, wordTN, wordFP, wordFN, whichClassToCompare);
    return;
  }
  countResultsPerEntity(doc, entityTP, entityFP, entityFN, background, wordTP, wordTN, wordFP, wordFN, whichClassToCompare);
}
/**
 * Writes each sentence on its own line as {@code id<TAB>tokens}, wrapping labeled runs of tokens
 * in {@code <label>} ... {@code </label>} markers.
 * <p>
 * A closing marker is emitted in front of the first token that no longer carries the label, so a
 * run that reaches the end of a sentence is not closed.
 *
 * @param sents sentences keyed by id
 * @param writer destination; not flushed or closed here
 * @throws IOException if writing fails
 */
private void writeLabelDataSents(Map<String, List<CoreLabel>> sents, BufferedWriter writer) throws IOException {
  for (Entry<String, List<CoreLabel>> sentence : sents.entrySet()) {
    writer.write(sentence.getKey() + "\t");
    // Per label: was the previous token of this sentence tagged with it?
    Map<String, Boolean> lastWordLabeled = new HashMap<String, Boolean>();
    for (String knownLabel : constVars.getLabelDictionary().keySet()) {
      lastWordLabeled.put(knownLabel, false);
    }
    for (CoreLabel token : sentence.getValue()) {
      List<String> closing = new ArrayList<String>();
      List<String> opening = new ArrayList<String>();
      for (Entry<String, Class<? extends TypesafeMap.Key<String>>> answer : constVars.answerClass.entrySet()) {
        String label = answer.getKey();
        boolean previouslyLabeled = lastWordLabeled.get(label);
        boolean nowLabeled = token.get(answer.getValue()).equals(label);
        if (nowLabeled && !previouslyLabeled) {
          opening.add(label);
        } else if (!nowLabeled && previouslyLabeled) {
          closing.add(label);
        }
        lastWordLabeled.put(label, nowLabeled);
      }
      StringBuilder out = new StringBuilder();
      // Closing markers go first and in reverse order, so labels finish before new ones start.
      int pos = closing.size();
      while (pos > 0) {
        pos--;
        out.append(" </").append(closing.get(pos)).append(">");
      }
      for (String label : opening) {
        out.append(" <").append(label).append("> ");
      }
      out.append(" ").append(token.word());
      writer.write(out.toString().trim() + " ");
    }
    writer.write("\n");
  }
}
/**
 * Writes all sentences, with label markers, to {@code outFile}.
 * <p>
 * When {@code constVars.batchProcessSents} is false the in-memory {@code Data.sents} are written;
 * otherwise every file in {@code Data.sentsFiles} is deserialized and written in turn.
 *
 * @param outFile path of the file to create or overwrite
 * @throws IOException if the output file cannot be written or a batch file cannot be read
 * @throws ClassNotFoundException if a batch file cannot be deserialized
 */
public void writeLabeledData(String outFile) throws IOException, ClassNotFoundException {
  BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
  // Close in finally so the file handle is released even if writing or deserialization fails.
  try {
    if (!constVars.batchProcessSents) {
      this.writeLabelDataSents(Data.sents, writer);
    } else {
      for (File f : Data.sentsFiles) {
        Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
        this.writeLabelDataSents(sents, writer);
      }
    }
  } finally {
    writer.close();
  }
}
// public Map<String, List<CoreLabel>> loadJavaNLPAnnotatorLabeledFile(String
// labeledFile, Properties props) throws FileNotFoundException {
// System.out.println("Loading evaluate file " + labeledFile);
// Map<String, List<CoreLabel>> sents = new HashMap<String,
// List<CoreLabel>>();
// JavaNLPAnnotatorReaderAndWriter j = new JavaNLPAnnotatorReaderAndWriter();
// j.init(props);
// Iterator<List<CoreLabel>> iter = j.getIterator(new BufferedReader(new
// FileReader(labeledFile)));
// int i = 0;
// while (iter.hasNext()) {
// i++;
// List<CoreLabel> s = iter.next();
// String id = s.get(0).get(CoreAnnotations.DocIDAnnotation.class);
// if (id == null) {
// id = Integer.toString(i);
// }
// sents.put(id, s);
// }
// System.out.println("Read " + sents.size() + " eval sentences");
// return sents;
// }
// private void evaluate(String label, Map<String, List<CoreLabel>> sents)
// throws IOException, InterruptedException, ExecutionException {
// Redwood.log(Redwood.DBG, "labeling " + learnedWords.get(label));
// CollectionValuedMap<String, Integer> tokensMatchedPatterns = new
// CollectionValuedMap<String, Integer>();
//
// if (restrictToMatched) {
// if (!alreadySetUp)
// setUp();
// List<String> keyset = new ArrayList<String>(sents.keySet());
// int num = 0;
// if (constVars.numThreads == 1)
// num = keyset.size();
// else
// num = keyset.size() / (constVars.numThreads - 1);
// ExecutorService executor = Executors
// .newFixedThreadPool(constVars.numThreads);
// // Redwood.log(ConstantsAndVariables.minimaldebug, "keyset size is " +
// // keyset.size());
// List<Future<Pair<TwoDimensionalCounter<Pair<String, String>,
// SurfacePattern>, CollectionValuedMap<String, Integer>>>> list = new
// ArrayList<Future<Pair<TwoDimensionalCounter<Pair<String, String>,
// SurfacePattern>, CollectionValuedMap<String, Integer>>>>();
// for (int i = 0; i < constVars.numThreads; i++) {
// // Redwood.log(ConstantsAndVariables.minimaldebug, "assigning from " + i *
// // num + " till " + Math.min(keyset.size(), (i + 1) * num));
//
// Callable<Pair<TwoDimensionalCounter<Pair<String, String>, SurfacePattern>,
// CollectionValuedMap<String, Integer>>> task = null;
// task = new ApplyPatterns(keyset.subList(i * num,
// Math.min(keyset.size(), (i + 1) * num)),
// this.learnedPatterns.get(label), constVars.commonEngWords,
// usePatternResultAsLabel, this.learnedWords.get(label).keySet(),
// restrictToMatched, label,
// constVars.removeStopWordsFromSelectedPhrases,
// constVars.removePhrasesWithStopWords, constVars);
// Future<Pair<TwoDimensionalCounter<Pair<String, String>, SurfacePattern>,
// CollectionValuedMap<String, Integer>>> submit = executor
// .submit(task);
// list.add(submit);
// }
// for (Future<Pair<TwoDimensionalCounter<Pair<String, String>,
// SurfacePattern>, CollectionValuedMap<String, Integer>>> future : list) {
// Pair<TwoDimensionalCounter<Pair<String, String>, SurfacePattern>,
// CollectionValuedMap<String, Integer>> res = future
// .get();
// tokensMatchedPatterns.addAll(res.second());
// }
// executor.shutdown();
// }
//
// this.labelWords(label, sents, this.learnedWords.get(label).keySet(),
// this.learnedPatterns.get(label).keySet(), null, tokensMatchedPatterns);
// Counter<String> entityTP = new ClassicCounter<String>();
// Counter<String> entityFP = new ClassicCounter<String>();
// Counter<String> entityFN = new ClassicCounter<String>();
// for (Entry<String, List<CoreLabel>> sent : sents.entrySet()) {
// for (CoreLabel l : sent.getValue()) {
// if (l.containsKey(constVars.answerClass.get(label))
// && l.get(constVars.answerClass.get(label)) != null)
// l.set(CoreAnnotations.AnswerAnnotation.class,
// l.get(constVars.answerClass.get(label)).toString());
// if (!l.containsKey(CoreAnnotations.AnswerAnnotation.class)
// || l.get(CoreAnnotations.AnswerAnnotation.class) == null) {
// l.set(CoreAnnotations.AnswerAnnotation.class,
// SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
//
// }
//
// }
// CRFClassifier.countResults(sent.getValue(), entityTP, entityFP, entityFN,
// SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
// }
//
// Counter<String> precision = Counters.division(entityTP,
// Counters.add(entityTP, entityFP));
// Counter<String> recall = Counters.division(entityTP,
// Counters.add(entityTP, entityFN));
// Counter<String> fscore = Counters.getFCounter(precision, recall, 1.0);
// System.out.println("Precision: " + precision);
// System.out.println("Recall: " + recall);
// System.out.println("FScore: " + fscore);
// }
/**
 * Scores the labeling of {@code testSentences} separately for every label in
 * {@code constVars.answerClass}.
 * <p>
 * For each label the tokens are copied into fresh {@code CoreLabel}s whose guessed and gold
 * annotations are collapsed to either that label or {@code constVars.backgroundSymbol}; the copies
 * are then passed to {@code countResults}. False positives/negatives are printed to stdout and
 * precision, recall and F1 are logged through Redwood.
 *
 * @param testSentences sentences keyed by id; every token must carry both the label's answer
 *          annotation and a {@code GoldAnswerAnnotation}
 * @param evalPerEntity whether to count per entity rather than per token
 * @throws IOException declared for callers; not thrown directly by the code in this method
 */
public void evaluate(Map<String, List<CoreLabel>> testSentences, boolean evalPerEntity) throws IOException {
  for (Entry<String, Class<? extends Key<String>>> anscl : constVars.answerClass.entrySet()) {
    final String label = anscl.getKey();
    final Counter<String> entityTP = new ClassicCounter<String>();
    final Counter<String> entityFP = new ClassicCounter<String>();
    final Counter<String> entityFN = new ClassicCounter<String>();
    final Counter<String> wordTP = new ClassicCounter<String>();
    final Counter<String> wordTN = new ClassicCounter<String>();
    final Counter<String> wordFP = new ClassicCounter<String>();
    final Counter<String> wordFN = new ClassicCounter<String>();
    for (List<CoreLabel> doc : testSentences.values()) {
      List<CoreLabel> doceval = new ArrayList<CoreLabel>();
      for (CoreLabel token : doc) {
        CoreLabel copy = new CoreLabel();
        copy.setWord(token.word());
        // Guessed side: anything other than the current label counts as background.
        boolean guessedAsLabel = token.get(anscl.getValue()).equals(label);
        copy.set(CoreAnnotations.AnswerAnnotation.class, guessedAsLabel ? label : constVars.backgroundSymbol);
        // If the gold label is not the label we are calculating the scores
        // for, set it to the background symbol
        boolean goldIsLabel = token.get(CoreAnnotations.GoldAnswerAnnotation.class).equals(label);
        copy.set(CoreAnnotations.GoldAnswerAnnotation.class, goldIsLabel ? label : constVars.backgroundSymbol);
        doceval.add(copy);
      }
      countResults(doceval, entityTP, entityFP, entityFN, constVars.backgroundSymbol, wordTP, wordTN, wordFP, wordFN,
          CoreAnnotations.AnswerAnnotation.class, evalPerEntity);
    }
    String falsePositives = Counters.toSortedString(wordFP, wordFP.size(), "%s:%.2f", ";");
    System.out.println("False Positives: " + falsePositives);
    String falseNegatives = Counters.toSortedString(wordFN, wordFN.size(), "%s:%.2f", ";");
    System.out.println("False Negatives: " + falseNegatives);
    Redwood.log(Redwood.DBG, "\nFor label " + label + " True Positives: " + entityTP + "\tFalse Positives: " + entityFP + "\tFalse Negatives: "
        + entityFN);
    Counter<String> precision = Counters.division(entityTP, Counters.add(entityTP, entityFP));
    Counter<String> recall = Counters.division(entityTP, Counters.add(entityTP, entityFN));
    Counter<String> f1 = FScore(precision, recall, 1);
    Redwood.log(ConstantsAndVariables.minimaldebug, "\nFor label " + label + " Precision: " + precision + ", Recall: " + recall + ", F1 score: "
        + f1);
  }
}
/**
 * Computes the per-key F-beta score from precision and recall counters:
 * {@code (1 + beta^2) * P * R / (beta^2 * P + R)}, using {@code Counters.divisionNonNaN} for the
 * final division.
 *
 * @param precision per-key precision
 * @param recall per-key recall
 * @param beta weight of recall relative to precision
 * @return counter of F-beta scores
 */
public static <D> Counter<D> FScore(Counter<D> precision, Counter<D> recall, double beta) {
  final double betaSquared = beta * beta;
  Counter<D> numerator = Counters.scale(Counters.product(precision, recall), (1 + betaSquared));
  Counter<D> denominator = Counters.add(Counters.scale(precision, betaSquared), recall);
  return Counters.divisionNonNaN(numerator, denominator);
}
/**
 * Expands a comma- or semicolon-separated list of paths into a flat list of files.
 * Directories are walked with {@code IOUtils.iterFilesRecursive} using a match-everything
 * pattern; any other path is added as given, without checking that it exists.
 *
 * @param file paths separated by {@code ,} or {@code ;}
 * @return all collected files, in the order encountered
 */
public static List<File> getAllFiles(String file) {
  final List<File> collected = new ArrayList<File>();
  final String[] locations = file.split("[,;]");
  for (int idx = 0; idx < locations.length; idx++) {
    File candidate = new File(locations[idx]);
    if (!candidate.isDirectory()) {
      Redwood.log(Redwood.DBG, "Reading file " + candidate);
      collected.add(candidate);
      continue;
    }
    for (File nested : IOUtils.iterFilesRecursive(candidate, Pattern.compile(".*"))) {
      Redwood.log(Redwood.DBG, "Reading file " + nested);
      collected.add(nested);
    }
  }
  return collected;
}
/**
 * Computes precision and recall of the words learned for {@code label} against a gold list.
 * <p>
 * Learned words missing from the gold map are counted as incorrect and reported on stderr.
 * Precision is correct / learned; recall is correct / number of gold entries marked true.
 *
 * @param label label whose learned words are scored
 * @param goldWords4Label gold words mapped to whether they are correct for the label
 * @return pair of (precision, recall)
 */
private Pair<Double, Double> getPrecisionRecall(String label, Map<String, Boolean> goldWords4Label) {
  Set<String> extracted = getLearnedWords(label).keySet();
  int goldPositives = 0;
  for (Boolean markedCorrect : goldWords4Label.values()) {
    if (markedCorrect) {
      goldPositives++;
    }
  }
  int hits = 0;
  int misses = 0;
  Set<String> assumedNeg = new HashSet<String>();
  for (String word : extracted) {
    Boolean verdict = goldWords4Label.get(word);
    if (verdict == null) {
      // Not in the gold list at all: treat as a negative and remember it for the warning.
      assumedNeg.add(word);
      misses++;
    } else if (verdict) {
      hits++;
    } else {
      misses++;
    }
  }
  if (!assumedNeg.isEmpty()) {
    System.err.println("Gold entity list does not contain words " + assumedNeg + " for label " + label + ". Assuming them as negative.");
  }
  double precision = hits / (double) (hits + misses);
  double recall = hits / (double) (goldPositives);
  return new Pair<Double, Double>(precision, recall);
}
/**
 * Computes the F-beta score {@code (1 + beta^2) * P * R / (beta^2 * P + R)}.
 * <p>
 * Returns 0 when the denominator is exactly zero (e.g. precision and recall both 0) instead of
 * NaN, in line with the Counter-based overload, which divides with
 * {@code Counters.divisionNonNaN}. NaN inputs still produce NaN.
 *
 * @param precision precision value
 * @param recall recall value
 * @param beta weight of recall relative to precision
 * @return the F-beta score, or 0 if the denominator is zero
 */
public double FScore(double precision, double recall, double beta) {
  double betasq = beta * beta;
  double denominator = betasq * precision + recall;
  if (denominator == 0.0) {
    return 0.0;
  }
  return (1 + betasq) * precision * recall / denominator;
}
@SuppressWarnings({ "rawtypes" })
public static void main(String[] args) {
try {
Properties props = StringUtils.argsToPropertiesWithResolve(args);
Map<String, Set<SurfacePattern>> ignorePatterns = new HashMap<String, Set<SurfacePattern>>();
Map<String, SurfacePattern> p0 = new HashMap<String, SurfacePattern>();
Map<String, Counter<String>> p0Set = new HashMap<String, Counter<String>>();
String fileFormat = props.getProperty("fileFormat");
Map<String, Set<String>> seedWords = new HashMap<String, Set<String>>();
String seedWordsFiles = props.getProperty("seedWordsFiles");
if (seedWordsFiles == null) {
throw new RuntimeException(
"Needs both seedWordsFiles and file parameters to run this class!\nseedWordsFiles has format: label1,filewithlistofwords1;label2,filewithlistofwords2;...");
}
for (String seedFile : seedWordsFiles.split(";")) {
String[] t = seedFile.split(",");
String label = t[0];
String seedWordsFile = t[1];
Set<String> seedWords4Label = new HashSet<String>();
for (String line : IOUtils.readLines(seedWordsFile)) {
line = line.trim();
if (line.isEmpty() || line.startsWith("#")) {
continue;
}
seedWords4Label.add(line);
}
seedWords.put(label, seedWords4Label);
Redwood.log(ConstantsAndVariables.minimaldebug, "Number of seed words for label " + label + " is " + seedWords4Label.size());
}
Map<String, Class> answerClasses = new HashMap<String, Class>();
String ansClasses = props.getProperty("answerClasses");
if (ansClasses != null) {
for (String l : ansClasses.split(";")) {
String[] t = l.split(",");
String label = t[0];
String cl = t[1];
Class answerClass = ClassLoader.getSystemClassLoader().loadClass(cl);
answerClasses.put(label, answerClass);
}
}
Map<String, List<CoreLabel>> sents = null;
boolean batchProcessSents = Boolean.parseBoolean(props.getProperty("batchProcessSents", "false"));
int numMaxSentencesPerBatchFile = Integer.parseInt(props.getProperty("numMaxSentencesPerBatchFile", String.valueOf(Integer.MAX_VALUE)));
if (!batchProcessSents)
sents = new HashMap<String, List<CoreLabel>>();
else
Data.sentsFiles = new ArrayList<File>();
String file = props.getProperty("file");
String posModelPath = props.getProperty("posModelPath");
boolean lowercase = Boolean.parseBoolean(props.getProperty("lowercaseText"));
boolean useTargetNERRestriction = Boolean.parseBoolean(props.getProperty("useTargetNERRestriction"));
boolean useTargetParserParentRestriction = Boolean.parseBoolean(props.getProperty("useTargetParserParentRestriction"));
boolean useContextNERRestriction = Boolean.parseBoolean(props.getProperty("useContextNERRestriction"));
boolean evaluate = Boolean.parseBoolean(props.getProperty("evaluate"));
boolean addEvalSentsToTrain = Boolean.parseBoolean(props.getProperty("addEvalSentsToTrain"));
String evalFileWithGoldLabels = props.getProperty("evalFileWithGoldLabels");
if (file == null && (evalFileWithGoldLabels == null || addEvalSentsToTrain == false)) {
throw new RuntimeException("No training data! file is " + file + " and evalFileWithGoldLabels is " + evalFileWithGoldLabels
+ " and addEvalSentsToTrain is " + addEvalSentsToTrain);
}
String saveSentencesSerDir = null;
boolean usingDirForSentsInIndex = true;
// Read training file
if (file != null) {
saveSentencesSerDir = props.getProperty("saveSentencesSerDir");
File saveSentencesSerDirFile = null;
if (saveSentencesSerDir != null) {
saveSentencesSerDirFile = new File(saveSentencesSerDir);
IOUtils.ensureDir(saveSentencesSerDirFile);
IOUtils.writeObjectToFile(sents, saveSentencesSerDir + "/sents_all.ser");
} else {
String systemdir = System.getProperty("java.io.tmpdir");
saveSentencesSerDirFile = File.createTempFile("sents", ".tmp", new File(systemdir));
saveSentencesSerDirFile.deleteOnExit();
saveSentencesSerDir = saveSentencesSerDirFile.getAbsolutePath();
saveSentencesSerDirFile.delete();
saveSentencesSerDirFile.mkdir();
}
List<File> allFiles = GetPatternsFromDataMultiClass.getAllFiles(file);
int numFilesTillNow = 0;
if (fileFormat == null || fileFormat.equalsIgnoreCase("text") || fileFormat.equalsIgnoreCase("txt")) {
Map<String, List<CoreLabel>> sentsthis = new HashMap<String, List<CoreLabel>>();
for (File f : allFiles) {
Redwood.log(Redwood.DBG, "Annotating text in " + f);
String text = IOUtils.stringFromFile(f.getAbsolutePath());
numFilesTillNow = tokenize(text, posModelPath, lowercase, useTargetNERRestriction || useContextNERRestriction, f.getName() + "-",
useTargetParserParentRestriction, props.getProperty("numThreads"), batchProcessSents, numMaxSentencesPerBatchFile,
saveSentencesSerDirFile, sentsthis, numFilesTillNow);
if (!batchProcessSents) {
sents.putAll(sentsthis);
}
}
if (!batchProcessSents) {
IOUtils.writeObjectToFile(sents, saveSentencesSerDirFile + "/sents_" + numFilesTillNow);
}
} else if (fileFormat.equalsIgnoreCase("ser")) {
usingDirForSentsInIndex = false;
for (File f : allFiles) {
if (!batchProcessSents)
sents.putAll((Map<String, List<CoreLabel>>) IOUtils.readObjectFromFile(f));
else{
File newf = new File(saveSentencesSerDir + "/" + f.getAbsolutePath().replaceAll(Pattern.quote("/"), "_"));
IOUtils.cp(f, newf);
Data.sentsFiles.add(newf);
}
}
} else {
throw new RuntimeException(
"Cannot identify the file format. Valid values are text (or txt) and ser, where the serialized file is of the type Map<String, List<CoreLabel>>.");
}
}
Map<String, List<CoreLabel>> evalsents = new HashMap<String, List<CoreLabel>>();
File saveEvalSentencesSerFileFile = null;
// Read Evaluation File
if (evaluate) {
if (evalFileWithGoldLabels != null) {
String saveEvalSentencesSerFile = props.getProperty("saveEvalSentencesSerFile");
if (saveEvalSentencesSerFile == null) {
String systemdir = System.getProperty("java.io.tmpdir");
saveEvalSentencesSerFileFile = File.createTempFile("evalsents", ".tmp", new File(systemdir));
} else
saveEvalSentencesSerFileFile = new File(saveEvalSentencesSerFile);
Map setClassForTheseLabels = new HashMap<String, Class>();
boolean splitOnPunct = Boolean.parseBoolean(props.getProperty("splitOnPunct", "true"));
List<File> allFiles = GetPatternsFromDataMultiClass.getAllFiles(evalFileWithGoldLabels);
int numFile = 0;
String evalFileFormat = props.getProperty("evalFileFormat");
if (evalFileFormat == null || evalFileFormat.equalsIgnoreCase("text") || evalFileFormat.equalsIgnoreCase("txt")) {
for (File f : allFiles) {
numFile++;
Redwood.log(Redwood.DBG, "Annotating text in " + f + ". Num file " + numFile);
List<CoreMap> sentsCMs = AnnotatedTextReader.parseFile(new BufferedReader(new FileReader(f)), seedWords.keySet(),
setClassForTheseLabels, true, splitOnPunct, lowercase, f.getName());
evalsents.putAll(runPOSNEROnTokens(sentsCMs, posModelPath, useTargetNERRestriction || useContextNERRestriction, "",
useTargetParserParentRestriction, props.getProperty("numThreads")));
}
} else if (fileFormat.equalsIgnoreCase("ser")) {
for (File f : allFiles) {
evalsents.putAll((Map<? extends String, ? extends List<CoreLabel>>) IOUtils.readObjectFromFile(f));
}
}
// if (addEvalSentsToTrain) {
Redwood.log(Redwood.DBG, "Adding " + evalsents.size() + " eval sents to the training set");
// }
IOUtils.writeObjectToFile(evalsents, saveEvalSentencesSerFileFile);
if (batchProcessSents) {
if (Data.sentsFiles == null)
Data.sentsFiles = new ArrayList<File>();
Data.sentsFiles.add(saveEvalSentencesSerFileFile);
} else
sents.putAll(evalsents);
}
}
boolean learn = Boolean.parseBoolean(props.getProperty("learn", "true"));
boolean labelUsingSeedSets = Boolean.parseBoolean(props.getProperty("labelUsingSeedSets", "true"));
GetPatternsFromDataMultiClass g = new GetPatternsFromDataMultiClass(props, sents, seedWords, labelUsingSeedSets);
g.constVars.usingDirForSentsInIndex = usingDirForSentsInIndex;
g.constVars.saveSentencesSerDir = saveSentencesSerDir;
Execution.fillOptions(g, props);
// Redwood.log(ConstantsAndVariables.minimaldebug,
// "Total number of training sentences " + Data.sents.size());
String sentsOutFile = props.getProperty("sentsOutFile");
String wordsOutputFile = props.getProperty("wordsOutputFile");
String patternOutFile = props.getProperty("patternOutFile");
// If you want to reuse patterns and words learned previously (may be on
// another dataset etc)
boolean loadSavedPatternsWordsDir = Boolean.parseBoolean(props.getProperty("loadSavedPatternsWordsDir"));
String patternsWordsDir = props.getProperty("patternsWordsDir");
if (loadSavedPatternsWordsDir) {
for (String label : g.constVars.getLabelDictionary().keySet()) {
assert (new File(patternsWordsDir + "/" + label).exists());
File patf = new File(patternsWordsDir + "/" + label + "/patterns.ser");
if (patf.exists()) {
Counter<SurfacePattern> patterns = IOUtils.readObjectFromFile(patf);
g.setLearnedPatterns(patterns, label);
Redwood.log(Redwood.DBG, "Loaded " + patterns.size() + " patterns from " + patf);
}
File wordf = new File(patternsWordsDir + "/" + label + "/phrases.txt");
if (wordf.exists()) {
Counter<String> words = g.readLearnedWordsFromFile(wordf);
g.setLearnedWords(words, label);
Redwood.log(Redwood.DBG, "Loaded " + words.size() + " from " + patf);
}
CollectionValuedMap<SurfacePattern, Triple<String, Integer, Integer>> matchedTokensByPat = null;
if (g.constVars.restrictToMatched) {
TwoDimensionalCounter<Pair<String, String>, SurfacePattern> wordsandLemmaPatExtracted = new TwoDimensionalCounter<Pair<String, String>, SurfacePattern>();
g.scorePhrases.applyPats(g.getLearnedPatterns(label), label, false, wordsandLemmaPatExtracted, matchedTokensByPat);
}
if (g.constVars.batchProcessSents) {
for (File f : Data.sentsFiles) {
Redwood.log(Redwood.DBG, "labeling sentences from " + f + " with the already learned words");
Map<String, List<CoreLabel>> sentsf = IOUtils.readObjectFromFile(f);
assert sentsf != null : "Why are sents null";
g.labelWords(label, sentsf, g.getLearnedWords(label).keySet(), g.getLearnedPatterns(label).keySet(), sentsOutFile, matchedTokensByPat);
IOUtils.writeObjectToFile(sentsf, f);
}
} else
g.labelWords(label, Data.sents, g.getLearnedWords(label).keySet(), g.getLearnedPatterns(label).keySet(), sentsOutFile, matchedTokensByPat);
}
}
if (learn)
g.iterateExtractApply(p0, p0Set, wordsOutputFile, sentsOutFile, patternOutFile, ignorePatterns);
if (g.constVars.markedOutputTextFile != null) {
g.writeLabeledData(g.constVars.markedOutputTextFile);
}
boolean savePatternsWordsDir = Boolean.parseBoolean(props.getProperty("savePatternsWordsDir"));
if (savePatternsWordsDir) {
for (String label : g.constVars.getLabelDictionary().keySet()) {
IOUtils.ensureDir(new File(patternsWordsDir + "/" + label));
IOUtils.writeObjectToFile(g.getLearnedPatterns(label), patternsWordsDir + "/" + label + "/patterns.ser");
BufferedWriter w = new BufferedWriter(new FileWriter(patternsWordsDir + "/" + label + "/phrases.txt"));
g.writeWordsToFile(g.getLearnedWords(label), w);
w.close();
}
}
if (evaluate) {
// goldEntitiesEvalFiles is assumed to have the same format as
// seedwordsfiles: label,file;label2,file2;...
// Each gold-entities file lists one entity per line; an incorrect
// entity is marked with a trailing "#".
// Learned entities that are not present in the gold file are considered
// negative.
String goldEntitiesEvalFiles = props.getProperty("goldEntitiesEvalFiles");
if (goldEntitiesEvalFiles != null) {
for (String gfile : goldEntitiesEvalFiles.split(";")) {
String[] t = gfile.split(",");
String label = t[0];
String goldfile = t[1];
Map<String, Boolean> goldWords4Label = new HashMap<String, Boolean>();
for (String line : IOUtils.readLines(goldfile)) {
line = line.trim();
if (line.isEmpty())
continue;
if (line.endsWith("#"))
goldWords4Label.put(line.substring(0, line.length() - 1), false);
else
goldWords4Label.put(line, true);
}
Pair<Double, Double> pr = g.getPrecisionRecall(label, goldWords4Label);
Redwood.log(ConstantsAndVariables.minimaldebug,
"\nFor label " + label + ": Number of gold entities is " + goldWords4Label.size() + ", Precision is " + g.df.format(pr.first() * 100)
+ ", Recall is " + g.df.format(pr.second() * 100) + ", F1 is " + g.df.format(g.FScore(pr.first(), pr.second(), 1.0) * 100)
+ "\n\n");
}
}
if (saveEvalSentencesSerFileFile != null && saveEvalSentencesSerFileFile.exists()) {
if (batchProcessSents)
evalsents = IOUtils.readObjectFromFile(saveEvalSentencesSerFileFile);
boolean evalPerEntity = Boolean.parseBoolean(props.getProperty("evalPerEntity", "true"));
g.evaluate(evalsents, evalPerEntity);
}
if (evalsents.size() == 0 && goldEntitiesEvalFiles == null)
System.err.println("No eval sentences or list of gold entities provided to evaluate! Make sure evalFileWithGoldLabels or goldEntitiesEvalFiles is set, or turn off the evaluate flag");
}
} catch (OutOfMemoryError e) {
System.out.println("Out of memory! Either change the memory alloted by running as java -mx20g ... for example if you wanna allot 20G. Or consider using batchProcessSents and numMaxSentencesPerBatchFile flags");
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
} // end main()
}